diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/Makefile b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/Makefile index 7f154abc8c64f150eb2a81887c2f2a4cb8227433..fb23f70f66e85a77ae057c4d2edb989097723987 100644 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/Makefile +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/Makefile @@ -40,6 +40,11 @@ CONF_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/data/ VISC_OPTFLAGS3 = -load $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LLVMInPlaceDFGAnalysis.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_WrapperAPI.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_X86.so -load $(HPVM_LIB_DIR)/LLVMFuseHPVMTensorNodes.so -load $(HPVM_LIB_DIR)/LLVMClearDFG.so -inplace -hpvm-fuse -dfg2llvm-wrapperapi -quantization-levels-filename=$(WRAPPER_API_QUANT_FILE_PATH) -configuration-inputs-filename=$(CONF_FILE_PATH) -dfg2llvm-x86 -clearDFG +TEMP_CONF_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/predictive/temp.txt + + +VISC_PRED_OPTFLAGS3 = -load $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LLVMInPlaceDFGAnalysis.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_WrapperAPI.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_X86.so -load $(HPVM_LIB_DIR)/LLVMFuseHPVMTensorNodes.so -load $(HPVM_LIB_DIR)/LLVMClearDFG.so -inplace -hpvm-fuse -dfg2llvm-wrapperapi -quantization-levels-filename=$(WRAPPER_API_QUANT_FILE_PATH) -configuration-inputs-filename=$(TEMP_CONF_FILE_PATH) -dfg2llvm-x86 -clearDFG + TARGET = $(BUILD_DIR)/$(APP).opt.bc SOURCES = $(SRC_DIR)/$(APP).cpp VISC_RT_PATH = $(LLVM_SRC_ROOT)/../build/projects/visc-rt/visc-rt.ll @@ -63,14 +68,20 @@ $(BUILD_DIR)/%.opt.bc: $(BUILD_DIR)/%.ll #$(OPT) $(VISC_OPTFLAGS2) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_promise.bc $(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_wrapperapi.bc $(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_loop.visc.ll -o $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc + $(OPT) $(VISC_PRED_OPTFLAGS3) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_pred_wrapperapi.bc + $(OPT) $(VISC_PRED_OPTFLAGS3) $(BUILD_DIR)/$(APP)_loop.visc.ll -o $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi.bc $(LLVM_LINK) $(BUILD_DIR)/$(APP)_cudnn.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_cudnn_linked.bc #$(LLVM_LINK) $(BUILD_DIR)/$(APP)_promise.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_promise_linked.bc $(LLVM_LINK) $(BUILD_DIR)/$(APP)_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(LLVM_LINK) $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc + $(LLVM_LINK) $(BUILD_DIR)/$(APP)_pred_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_pred_wrapperapi_linked.bc + $(LLVM_LINK) $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi_linked.bc $(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_cudnn_linked $(LINKER_FLAGS) #$(CC) $(BUILD_DIR)/$(APP)_promise_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_promise_linked $(LINKER_FLAGS) $(CC) $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked $(LINKER_FLAGS) $(CC) $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked $(LINKER_FLAGS) + $(CC) $(BUILD_DIR)/$(APP)_pred_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_pred_wrapperapi_linked $(LINKER_FLAGS) + $(CC) $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi_linked $(LINKER_FLAGS) $(BUILD_DIR): mkdir -p $@ diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/data/run_data/out-run-1 b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/data/run_data/out-run-1 new file mode 100644 index 0000000000000000000000000000000000000000..7c5d66889d07891d1c53b84e9ba29b3876c841e3 --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/data/run_data/out-run-1 @@ -0,0 +1,40506 @@ +size_in_bytes = 92928 +DEBUG: ***--- size_in_bytes = 92928 +DEBUG: Attempting to Allocate = 92928 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 363, cStride = 121, hStride = 11, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 256 +DEBUG: ***--- size_in_bytes = 256 +DEBUG: Attempting to Allocate = 256 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 64, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 1228800 +DEBUG: ***--- size_in_bytes = 1228800 +DEBUG: Attempting to Allocate = 1228800 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 1600, cStride = 25, hStride = 5, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 768 +DEBUG: ***--- size_in_bytes = 768 +DEBUG: Attempting to Allocate = 768 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 192, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 2654208 +DEBUG: ***--- size_in_bytes = 2654208 +DEBUG: Attempting to Allocate = 2654208 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 1728, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 1536 +DEBUG: ***--- size_in_bytes = 1536 +DEBUG: Attempting to Allocate = 1536 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 384, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 3538944 +DEBUG: ***--- size_in_bytes = 3538944 +DEBUG: Attempting to Allocate = 3538944 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3456, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 1024 +DEBUG: ***--- size_in_bytes = 1024 +DEBUG: Attempting to Allocate = 1024 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 256, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 2359296 +DEBUG: ***--- size_in_bytes = 2359296 +DEBUG: Attempting to Allocate = 2359296 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2304, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 1024 +DEBUG: ***--- size_in_bytes = 1024 +DEBUG: Attempting to Allocate = 1024 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 256, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 163840 +DEBUG: ***--- size_in_bytes = 163840 +DEBUG: Attempting to Allocate = 163840 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 40960, cStride = 40960, hStride = 10, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 40 +DEBUG: ***--- size_in_bytes = 40 +DEBUG: Attempting to Allocate = 40 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INITIALIZING GPU 0 +CREATED HANDLES 0 +INFO: +WARNING: File 'opentuner_flags' not found + + +initializing tuner .... +* LLVM_SRC_ROOT = /home/akashk4/merge/profiling/hpvm/llvm +- knobs_file_path = /home/akashk4/merge/profiling/hpvm/llvm/projects/hpvm-tensor-rt/autotuner/data/global_knobs.txt +*LLVM_SRC_ROOT = /home/akashk4/merge/profiling/hpvm/llvm- knobs_file_path = /home/akashk4/merge/profiling/hpvm/llvm/projects/hpvm-tensor-rt/autotuner/data/global_knobs.txt +Read PROMISE FLAGS 0 +DONE INTIALIZING GPU 0 +INFO: Reading Quantization Ranges File... +INFO: DONE. +INFO: Reading Configuration File... +DEBUG: first_line: 2000 +DEBUG: Baseline time: 2000.000000 + +DEBUG: line: +++++ +DEBUG: t: +++++ +DEBUG: +DEBUG: line: conf1 3.86 0 79.1 0.0 +DEBUG: t: conf1 +DEBUG: t: 3.86 +DEBUG: t: 0 +DEBUG: t: 79.1 +DEBUG: t: 0.0 +DEBUG: +DEBUG: line: 1 gpu conv fp32 1 add fp32 1 tanh fp32 1 pool_max fp32 1 +DEBUG: t: 1 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: tanh +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: pool_max +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 1 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found tanh operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found pool_max operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 2 gpu conv fp32 1 add fp32 1 tanh fp32 1 pool_max fp32 1 +DEBUG: t: 2 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: tanh +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: pool_max +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 5 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found tanh operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found pool_max operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 3 gpu conv fp32 1 add fp32 1 tanh fp32 1 +DEBUG: t: 3 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: tanh +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 9 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found tanh operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 4 gpu conv fp32 1 add fp32 1 tanh fp32 1 +DEBUG: t: 4 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: tanh +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 12 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found tanh operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 5 gpu conv fp32 1 add fp32 1 tanh fp32 1 pool_max fp32 1 +DEBUG: t: 5 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: tanh +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: pool_max +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 15 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found tanh operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found pool_max operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 6 gpu mul fp32 1 add fp32 1 +DEBUG: t: 6 +DEBUG: t: gpu +DEBUG: t: mul +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 19 + +DEBUG: Found mul operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 7 gpu softmax fp32 1 +DEBUG: t: 7 +DEBUG: t: gpu +DEBUG: t: softmax +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 21 + +DEBUG: Found softmax operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: ----- +DEBUG: t: ----- +DEBUG: +DEBUG: DONE. +INFO: Sorting autotuner configurations... +INFO: Done sorting. +INFO: Speedup Configurations ++++++ +conf1 3.860000 0.000000 79.099998 0.000000 +1 : gpu conv fp32 1 add fp32 1 tanh fp32 1 pool_max fp32 1 +2 : gpu conv fp32 1 add fp32 1 tanh fp32 1 pool_max fp32 1 +3 : gpu conv fp32 1 add fp32 1 tanh fp32 1 +4 : gpu conv fp32 1 add fp32 1 tanh fp32 1 +5 : gpu conv fp32 1 add fp32 1 tanh fp32 1 pool_max fp32 1 +6 : gpu mul fp32 1 add fp32 1 +7 : gpu softmax fp32 1 +----- +DEBUG: slowdowns file not found. Initializing slowdowns randomly. +*LLVM_SRC_ROOT = /home/akashk4/merge/profiling/hpvm/llvm- knobs_file_path = /home/akashk4/merge/profiling/hpvm/llvm/projects/hpvm-tensor-rt/autotuner/data/global_knobs.txt +* LLVM_SRC_ROOT = /home/akashk4/merge/profiling/hpvm/llvm +- knobs_file_path = /home/akashk4/merge/profiling/hpvm/llvm/projects/hpvm-tensor-rt/autotuner/data/global_knobs.txt +WARNING: pause_profiler was already called +Initializing policy object ... +DONE: Initializing policy object. +Select target device (0 for CPU, 1 fpr GPU): DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +INFO: Moving 92928 bytes from host to GPU +INFO: Moving 256 bytes from host to GPU +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352752.295738 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352752.296727 +INFO: TimeDuration, Event = Add_end, Time = 0.000990 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352752.296762 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352752.297676 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000914 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352752.297709 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352752.301521 +INFO: TimeDuration, Event = Pool_end, Time = 0.003812 +DEBUG: No data movement required - Data on Device +INFO: Moving 1228800 bytes from host to GPU +INFO: Moving 768 bytes from host to GPU +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352752.336723 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352752.337521 +INFO: TimeDuration, Event = Add_end, Time = 0.000797 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352752.337549 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352752.338205 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000656 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352752.338230 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352752.341296 +INFO: TimeDuration, Event = Pool_end, Time = 0.003065 +DEBUG: No data movement required - Data on Device +INFO: Moving 2654208 bytes from host to GPU +INFO: Moving 1536 bytes from host to GPU +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352752.368838 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352752.369336 +INFO: TimeDuration, Event = Add_end, Time = 0.000498 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352752.369364 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352752.369713 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000349 +DEBUG: No data movement required - Data on Device +INFO: Moving 3538944 bytes from host to GPU +INFO: Moving 1024 bytes from host to GPU +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352752.387317 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352752.387641 +INFO: TimeDuration, Event = Add_end, Time = 0.000325 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352752.387659 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352752.387893 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000234 +DEBUG: No data movement required - Data on Device +INFO: Moving 2359296 bytes from host to GPU +INFO: Moving 1024 bytes from host to GPU +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352752.409175 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352752.409515 +INFO: TimeDuration, Event = Add_end, Time = 0.000340 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352752.409542 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352752.409789 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000247 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352752.409824 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352752.412476 +INFO: TimeDuration, Event = Pool_end, Time = 0.002652 +DEBUG: No data movement required - Data on Device +INFO: Moving 163840 bytes from host to GPU +INFO: Moving 40 bytes from host to GPU +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352752.415461 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352752.415614 +INFO: TimeDuration, Event = Mul_end, Time = 0.000153 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352752.415640 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352752.415726 +INFO: TimeDuration, Event = Add_end, Time = 0.000086 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352752.415753 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352752.415827 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000073 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 106.693921, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352752.511666 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352752.512602 +INFO: TimeDuration, Event = Add_end, Time = 0.000935 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352752.512719 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352752.513583 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000864 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352752.513614 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352752.517411 +INFO: TimeDuration, Event = Pool_end, Time = 0.003797 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352752.549701 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352752.550438 +INFO: TimeDuration, Event = Add_end, Time = 0.000736 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352752.550466 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352752.551137 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000671 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352752.551164 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352752.554256 +INFO: TimeDuration, Event = Pool_end, Time = 0.003092 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352752.574404 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352752.574904 +INFO: TimeDuration, Event = Add_end, Time = 0.000500 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352752.574933 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352752.575282 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000349 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352752.590695 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352752.591018 +INFO: TimeDuration, Event = Add_end, Time = 0.000324 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352752.591035 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352752.591267 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000233 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352752.606840 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352752.607173 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352752.607200 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352752.607445 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000245 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352752.607481 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352752.610131 +INFO: TimeDuration, Event = Pool_end, Time = 0.002650 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352752.610166 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352752.610304 +INFO: TimeDuration, Event = Mul_end, Time = 0.000138 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352752.610329 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352752.610363 +INFO: TimeDuration, Event = Add_end, Time = 0.000034 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352752.610389 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352752.610457 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000068 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 81.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 114.706602, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352752.705257 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352752.706188 +INFO: TimeDuration, Event = Add_end, Time = 0.000931 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352752.706206 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352752.707067 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000861 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352752.707086 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352752.711019 +INFO: TimeDuration, Event = Pool_end, Time = 0.003933 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352752.739861 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352752.740626 +INFO: TimeDuration, Event = Add_end, Time = 0.000766 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352752.740641 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352752.741344 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000703 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352752.741361 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352752.744454 +INFO: TimeDuration, Event = Pool_end, Time = 0.003093 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352752.762705 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352752.763182 +INFO: TimeDuration, Event = Add_end, Time = 0.000478 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352752.763200 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352752.763538 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000338 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352752.777148 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352752.777472 +INFO: TimeDuration, Event = Add_end, Time = 0.000324 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352752.777489 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352752.777724 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000235 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352752.792358 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352752.792678 +INFO: TimeDuration, Event = Add_end, Time = 0.000320 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352752.792694 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352752.792930 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000236 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352752.792951 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352752.795635 +INFO: TimeDuration, Event = Pool_end, Time = 0.002685 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352752.795678 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352752.795820 +INFO: TimeDuration, Event = Mul_end, Time = 0.000142 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352752.795838 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352752.795875 +INFO: TimeDuration, Event = Add_end, Time = 0.000036 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352752.795892 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352752.796044 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000151 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 79.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 101.235232, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352752.887885 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352752.888841 +INFO: TimeDuration, Event = Add_end, Time = 0.000957 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352752.888875 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352752.889753 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000877 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352752.889766 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352752.893637 +INFO: TimeDuration, Event = Pool_end, Time = 0.003870 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352752.920285 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352752.921065 +INFO: TimeDuration, Event = Add_end, Time = 0.000779 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352752.921233 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352752.921897 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000664 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352752.922060 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352752.925550 +INFO: TimeDuration, Event = Pool_end, Time = 0.003489 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352752.943443 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352752.943920 +INFO: TimeDuration, Event = Add_end, Time = 0.000477 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352752.943934 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352752.944273 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000339 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352752.957198 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352752.957520 +INFO: TimeDuration, Event = Add_end, Time = 0.000322 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352752.957534 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352752.957766 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352752.971937 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352752.972254 +INFO: TimeDuration, Event = Add_end, Time = 0.000318 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352752.972268 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352752.972812 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000545 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352752.972837 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352752.976310 +INFO: TimeDuration, Event = Pool_end, Time = 0.003473 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352752.976609 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352752.976722 +INFO: TimeDuration, Event = Mul_end, Time = 0.000113 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352752.976737 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352752.976762 +INFO: TimeDuration, Event = Add_end, Time = 0.000025 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352752.976778 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352752.976826 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000049 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 98.809286, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352753.065149 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352753.066083 +INFO: TimeDuration, Event = Add_end, Time = 0.000934 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352753.066100 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352753.066967 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000868 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352753.066981 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352753.070576 +INFO: TimeDuration, Event = Pool_end, Time = 0.003595 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352753.096465 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352753.097196 +INFO: TimeDuration, Event = Add_end, Time = 0.000730 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352753.097210 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352753.097857 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000647 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352753.097870 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352753.101058 +INFO: TimeDuration, Event = Pool_end, Time = 0.003188 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352753.118896 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352753.119383 +INFO: TimeDuration, Event = Add_end, Time = 0.000487 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352753.119396 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352753.119737 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000341 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352753.132679 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352753.132997 +INFO: TimeDuration, Event = Add_end, Time = 0.000318 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352753.133011 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352753.133242 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352753.148104 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352753.148415 +INFO: TimeDuration, Event = Add_end, Time = 0.000311 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352753.148679 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352753.148909 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352753.148928 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352753.151397 +INFO: TimeDuration, Event = Pool_end, Time = 0.002469 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352753.151418 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352753.151522 +INFO: TimeDuration, Event = Mul_end, Time = 0.000104 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352753.151535 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352753.151558 +INFO: TimeDuration, Event = Add_end, Time = 0.000023 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352753.151572 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352753.151618 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000047 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 82.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 97.754588, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352753.245789 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352753.246717 +INFO: TimeDuration, Event = Add_end, Time = 0.000929 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352753.246734 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352753.247597 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000863 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352753.247611 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352753.250518 +INFO: TimeDuration, Event = Pool_end, Time = 0.002907 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352753.277401 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352753.278123 +INFO: TimeDuration, Event = Add_end, Time = 0.000722 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352753.278136 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352753.278782 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000646 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352753.278794 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352753.281994 +INFO: TimeDuration, Event = Pool_end, Time = 0.003200 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352753.299856 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352753.300328 +INFO: TimeDuration, Event = Add_end, Time = 0.000472 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352753.300594 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352753.300928 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000333 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352753.313234 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352753.313542 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352753.313554 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352753.313782 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352753.327964 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352753.328268 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352753.328281 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352753.328510 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352753.328524 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352753.331242 +INFO: TimeDuration, Event = Pool_end, Time = 0.002718 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352753.331260 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352753.331357 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352753.331370 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352753.331391 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352753.331403 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352753.331446 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.909273, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352753.422687 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352753.423625 +INFO: TimeDuration, Event = Add_end, Time = 0.000938 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352753.423644 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352753.424514 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000870 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352753.424529 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352753.427408 +INFO: TimeDuration, Event = Pool_end, Time = 0.002879 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352753.454669 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352753.455394 +INFO: TimeDuration, Event = Add_end, Time = 0.000726 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352753.455412 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352753.456062 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000650 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352753.456078 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352753.459262 +INFO: TimeDuration, Event = Pool_end, Time = 0.003184 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352753.477217 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352753.477690 +INFO: TimeDuration, Event = Add_end, Time = 0.000473 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352753.477708 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352753.478046 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000337 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352753.490907 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352753.491223 +INFO: TimeDuration, Event = Add_end, Time = 0.000316 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352753.491239 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352753.491472 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000234 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352753.505770 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352753.506079 +INFO: TimeDuration, Event = Add_end, Time = 0.000309 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352753.506095 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352753.506332 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000237 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352753.506353 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352753.509065 +INFO: TimeDuration, Event = Pool_end, Time = 0.002713 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352753.509091 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352753.509199 +INFO: TimeDuration, Event = Mul_end, Time = 0.000109 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352753.509217 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352753.509243 +INFO: TimeDuration, Event = Add_end, Time = 0.000026 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352753.509260 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352753.509314 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000054 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 80.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 98.303429, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352753.603297 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352753.604227 +INFO: TimeDuration, Event = Add_end, Time = 0.000930 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352753.604246 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352753.605117 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000871 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352753.605137 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352753.608025 +INFO: TimeDuration, Event = Pool_end, Time = 0.002888 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352753.635294 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352753.636017 +INFO: TimeDuration, Event = Add_end, Time = 0.000723 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352753.636034 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352753.636686 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000651 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352753.636704 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352753.639870 +INFO: TimeDuration, Event = Pool_end, Time = 0.003167 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352753.657818 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352753.658287 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352753.658303 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352753.658640 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000337 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352753.671528 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352753.671844 +INFO: TimeDuration, Event = Add_end, Time = 0.000316 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352753.671861 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352753.672093 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000233 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352753.686390 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352753.686698 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352753.686714 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352753.686948 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000234 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352753.686969 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352753.689687 +INFO: TimeDuration, Event = Pool_end, Time = 0.002718 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352753.689712 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352753.689820 +INFO: TimeDuration, Event = Mul_end, Time = 0.000108 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352753.689836 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352753.689862 +INFO: TimeDuration, Event = Add_end, Time = 0.000025 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352753.689879 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352753.689960 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000081 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 79.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 98.346969, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352753.794679 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352753.795655 +INFO: TimeDuration, Event = Add_end, Time = 0.000976 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352753.795675 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352753.796557 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000882 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352753.796602 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352753.800610 +INFO: TimeDuration, Event = Pool_end, Time = 0.004008 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352753.826689 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352753.827437 +INFO: TimeDuration, Event = Add_end, Time = 0.000748 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352753.827455 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352753.828109 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000654 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352753.828125 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352753.831268 +INFO: TimeDuration, Event = Pool_end, Time = 0.003143 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352753.849227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352753.849699 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352753.849715 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352753.850049 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000334 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352753.862950 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352753.863262 +INFO: TimeDuration, Event = Add_end, Time = 0.000312 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352753.863278 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352753.863510 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352753.877841 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352753.878150 +INFO: TimeDuration, Event = Add_end, Time = 0.000310 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352753.878168 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352753.878401 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000233 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352753.878422 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352753.881142 +INFO: TimeDuration, Event = Pool_end, Time = 0.002720 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352753.881167 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352753.881274 +INFO: TimeDuration, Event = Mul_end, Time = 0.000107 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352753.881290 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352753.881345 +INFO: TimeDuration, Event = Add_end, Time = 0.000055 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352753.881363 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352753.881433 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000070 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 99.107548, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352753.974971 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352753.975903 +INFO: TimeDuration, Event = Add_end, Time = 0.000931 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352753.975921 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352753.976792 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000871 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352753.976813 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352753.979693 +INFO: TimeDuration, Event = Pool_end, Time = 0.002880 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.006940 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.007665 +INFO: TimeDuration, Event = Add_end, Time = 0.000725 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352754.007682 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352754.008330 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000649 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352754.008343 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352754.011535 +INFO: TimeDuration, Event = Pool_end, Time = 0.003192 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.029493 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.029961 +INFO: TimeDuration, Event = Add_end, Time = 0.000468 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352754.029979 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352754.030314 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000336 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.043182 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.043496 +INFO: TimeDuration, Event = Add_end, Time = 0.000314 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352754.043512 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352754.043744 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000233 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.058159 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.058471 +INFO: TimeDuration, Event = Add_end, Time = 0.000312 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352754.058488 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352754.058721 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000233 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352754.058742 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352754.061458 +INFO: TimeDuration, Event = Pool_end, Time = 0.002715 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352754.061484 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352754.061591 +INFO: TimeDuration, Event = Mul_end, Time = 0.000108 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.061609 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.061634 +INFO: TimeDuration, Event = Add_end, Time = 0.000025 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352754.061651 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352754.061702 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000051 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 76.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 98.157637, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.107636 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.108568 +INFO: TimeDuration, Event = Add_end, Time = 0.000932 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352754.108585 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352754.109452 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000867 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352754.109470 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352754.112363 +INFO: TimeDuration, Event = Pool_end, Time = 0.002893 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.139655 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.140381 +INFO: TimeDuration, Event = Add_end, Time = 0.000726 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352754.140681 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352754.141347 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000666 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352754.141375 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352754.160179 +INFO: TimeDuration, Event = Pool_end, Time = 0.018804 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.167049 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.167517 +INFO: TimeDuration, Event = Add_end, Time = 0.000468 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352754.167534 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352754.167875 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000341 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.180880 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.181194 +INFO: TimeDuration, Event = Add_end, Time = 0.000314 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352754.181210 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352754.181442 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.195792 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.196101 +INFO: TimeDuration, Event = Add_end, Time = 0.000309 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352754.196117 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352754.196350 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000233 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352754.196654 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352754.200143 +INFO: TimeDuration, Event = Pool_end, Time = 0.003489 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352754.200169 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352754.200280 +INFO: TimeDuration, Event = Mul_end, Time = 0.000111 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.200297 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.200642 +INFO: TimeDuration, Event = Add_end, Time = 0.000345 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352754.200662 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352754.200715 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000053 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 102.664774, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.245472 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.246401 +INFO: TimeDuration, Event = Add_end, Time = 0.000929 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352754.246421 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352754.247286 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000865 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352754.247303 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352754.250195 +INFO: TimeDuration, Event = Pool_end, Time = 0.002892 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.277832 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.278560 +INFO: TimeDuration, Event = Add_end, Time = 0.000728 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352754.278577 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352754.279231 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000654 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352754.279249 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352754.282416 +INFO: TimeDuration, Event = Pool_end, Time = 0.003167 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.300418 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.300888 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352754.300906 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352754.301247 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000341 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.314229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.314541 +INFO: TimeDuration, Event = Add_end, Time = 0.000312 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352754.314559 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352754.314791 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.329134 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.329443 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352754.329460 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352754.329695 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000235 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352754.329715 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352754.332432 +INFO: TimeDuration, Event = Pool_end, Time = 0.002716 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352754.332452 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352754.332562 +INFO: TimeDuration, Event = Mul_end, Time = 0.000110 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.332575 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.332599 +INFO: TimeDuration, Event = Add_end, Time = 0.000024 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352754.332619 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352754.332673 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000053 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 81.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 97.350046, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.375488 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.376413 +INFO: TimeDuration, Event = Add_end, Time = 0.000925 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352754.376426 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352754.377291 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000865 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352754.377307 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352754.380205 +INFO: TimeDuration, Event = Pool_end, Time = 0.002898 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.407458 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.408177 +INFO: TimeDuration, Event = Add_end, Time = 0.000719 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352754.408190 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352754.408839 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000649 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352754.408852 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352754.412049 +INFO: TimeDuration, Event = Pool_end, Time = 0.003197 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.429847 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.430312 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352754.430325 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352754.430660 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000335 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.443295 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.443605 +INFO: TimeDuration, Event = Add_end, Time = 0.000309 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352754.443618 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352754.443848 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.458026 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.458329 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352754.458342 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352754.458572 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352754.458588 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352754.461320 +INFO: TimeDuration, Event = Pool_end, Time = 0.002732 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352754.461341 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352754.461437 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.461452 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.461473 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352754.461487 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352754.461529 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 79.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.552008, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.503698 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.504626 +INFO: TimeDuration, Event = Add_end, Time = 0.000928 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352754.504643 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352754.505512 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000870 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352754.505526 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352754.508422 +INFO: TimeDuration, Event = Pool_end, Time = 0.002896 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.535646 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.536364 +INFO: TimeDuration, Event = Add_end, Time = 0.000718 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352754.536374 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352754.537024 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000650 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352754.537035 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352754.540240 +INFO: TimeDuration, Event = Pool_end, Time = 0.003205 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.559030 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.559497 +INFO: TimeDuration, Event = Add_end, Time = 0.000467 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352754.559510 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352754.559849 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000338 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.572552 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.572861 +INFO: TimeDuration, Event = Add_end, Time = 0.000310 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352754.572874 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352754.573104 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.587285 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.587589 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352754.587600 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352754.587830 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352754.587845 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352754.590578 +INFO: TimeDuration, Event = Pool_end, Time = 0.002733 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352754.590598 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352754.590695 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.590707 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.590728 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352754.590743 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352754.590785 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000042 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 96.614554, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.634462 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.635377 +INFO: TimeDuration, Event = Add_end, Time = 0.000915 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352754.635391 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352754.636252 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000861 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352754.636267 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352754.639200 +INFO: TimeDuration, Event = Pool_end, Time = 0.002934 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.666482 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.667205 +INFO: TimeDuration, Event = Add_end, Time = 0.000724 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352754.667219 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352754.667867 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000648 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352754.667879 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352754.671074 +INFO: TimeDuration, Event = Pool_end, Time = 0.003195 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.688862 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.689326 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352754.689340 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352754.689672 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000332 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.702370 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.702679 +INFO: TimeDuration, Event = Add_end, Time = 0.000309 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352754.702691 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352754.702922 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.717103 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.717406 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352754.717420 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352754.717649 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352754.717664 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352754.720394 +INFO: TimeDuration, Event = Pool_end, Time = 0.002730 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352754.720596 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352754.720694 +INFO: TimeDuration, Event = Mul_end, Time = 0.000098 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.720708 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.720731 +INFO: TimeDuration, Event = Add_end, Time = 0.000023 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352754.720744 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352754.720795 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000051 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 82.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.872673, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.763242 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.764163 +INFO: TimeDuration, Event = Add_end, Time = 0.000920 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352754.764179 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352754.765044 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000864 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352754.765059 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352754.767964 +INFO: TimeDuration, Event = Pool_end, Time = 0.002905 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.795206 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.795925 +INFO: TimeDuration, Event = Add_end, Time = 0.000719 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352754.795938 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352754.796635 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000697 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352754.796652 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352754.799809 +INFO: TimeDuration, Event = Pool_end, Time = 0.003156 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.817599 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.818066 +INFO: TimeDuration, Event = Add_end, Time = 0.000467 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352754.818088 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352754.818431 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000343 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.831092 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.831399 +INFO: TimeDuration, Event = Add_end, Time = 0.000307 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352754.831412 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352754.831639 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.846893 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.847198 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352754.847212 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352754.847441 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352754.847459 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352754.850184 +INFO: TimeDuration, Event = Pool_end, Time = 0.002725 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352754.850204 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352754.850302 +INFO: TimeDuration, Event = Mul_end, Time = 0.000098 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.850344 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.850368 +INFO: TimeDuration, Event = Add_end, Time = 0.000024 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352754.850382 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352754.850425 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 96.703147, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.893594 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.894523 +INFO: TimeDuration, Event = Add_end, Time = 0.000929 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352754.894539 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352754.895404 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000865 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352754.895420 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352754.898329 +INFO: TimeDuration, Event = Pool_end, Time = 0.002909 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.925905 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.926626 +INFO: TimeDuration, Event = Add_end, Time = 0.000721 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352754.926639 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352754.927291 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000652 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352754.927305 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352754.930479 +INFO: TimeDuration, Event = Pool_end, Time = 0.003175 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.948265 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.948731 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352754.948748 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352754.949082 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000334 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.961782 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.962090 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352754.962102 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352754.962331 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.976549 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.976854 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352754.976868 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352754.977100 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352754.977115 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352754.979838 +INFO: TimeDuration, Event = Pool_end, Time = 0.002723 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352754.979857 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352754.979953 +INFO: TimeDuration, Event = Mul_end, Time = 0.000096 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352754.979965 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352754.979987 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352754.980000 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352754.980047 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000046 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 80.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 96.113452, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.022861 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.023787 +INFO: TimeDuration, Event = Add_end, Time = 0.000925 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352755.023803 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352755.024666 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000863 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352755.024681 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352755.027585 +INFO: TimeDuration, Event = Pool_end, Time = 0.002903 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.054901 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.055621 +INFO: TimeDuration, Event = Add_end, Time = 0.000720 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352755.055635 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352755.056284 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000649 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352755.056297 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352755.059505 +INFO: TimeDuration, Event = Pool_end, Time = 0.003208 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.077280 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.077746 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352755.077759 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352755.078098 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000339 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.090788 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.091097 +INFO: TimeDuration, Event = Add_end, Time = 0.000310 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352755.091110 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352755.091339 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.105533 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.105838 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352755.105852 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352755.106081 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352755.106098 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352755.108828 +INFO: TimeDuration, Event = Pool_end, Time = 0.002729 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352755.108847 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352755.108944 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.108957 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.108978 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352755.108993 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352755.109042 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000050 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 79.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.802193, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.153442 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.154370 +INFO: TimeDuration, Event = Add_end, Time = 0.000928 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352755.154384 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352755.155265 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000881 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352755.155277 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352755.158165 +INFO: TimeDuration, Event = Pool_end, Time = 0.002887 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.185436 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.186159 +INFO: TimeDuration, Event = Add_end, Time = 0.000722 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352755.186173 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352755.186824 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000650 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352755.186834 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352755.190009 +INFO: TimeDuration, Event = Pool_end, Time = 0.003174 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.207785 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.208249 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352755.208262 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352755.208597 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000335 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.221270 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.221578 +INFO: TimeDuration, Event = Add_end, Time = 0.000307 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352755.221592 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352755.221821 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.235981 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.236283 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352755.236295 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352755.236542 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000247 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352755.236594 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352755.239274 +INFO: TimeDuration, Event = Pool_end, Time = 0.002680 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352755.239293 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352755.239391 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.239403 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.239425 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352755.239457 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352755.239501 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.680308, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.281682 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.282598 +INFO: TimeDuration, Event = Add_end, Time = 0.000916 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352755.282615 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352755.283479 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000865 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352755.283495 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352755.286403 +INFO: TimeDuration, Event = Pool_end, Time = 0.002908 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.313657 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.314373 +INFO: TimeDuration, Event = Add_end, Time = 0.000717 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352755.314387 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352755.315037 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000650 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352755.315053 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352755.318249 +INFO: TimeDuration, Event = Pool_end, Time = 0.003196 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.336034 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.336499 +INFO: TimeDuration, Event = Add_end, Time = 0.000465 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352755.336586 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352755.336919 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000333 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.349543 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.349850 +INFO: TimeDuration, Event = Add_end, Time = 0.000307 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352755.349863 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352755.350092 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.364238 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.364540 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352755.364551 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352755.364780 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352755.364797 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352755.367534 +INFO: TimeDuration, Event = Pool_end, Time = 0.002737 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352755.367553 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352755.367650 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.367661 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.367682 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352755.367696 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352755.367747 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000051 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 76.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.674716, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.410305 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.411231 +INFO: TimeDuration, Event = Add_end, Time = 0.000926 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352755.411262 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352755.412155 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000893 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352755.412169 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352755.415028 +INFO: TimeDuration, Event = Pool_end, Time = 0.002860 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.442257 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.442975 +INFO: TimeDuration, Event = Add_end, Time = 0.000718 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352755.442988 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352755.443637 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000649 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352755.443650 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352755.446852 +INFO: TimeDuration, Event = Pool_end, Time = 0.003202 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.464637 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.465101 +INFO: TimeDuration, Event = Add_end, Time = 0.000463 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352755.465113 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352755.465448 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000335 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.478136 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.478444 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352755.478457 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352755.478688 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.492910 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.493212 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352755.493226 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352755.493456 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352755.493473 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352755.496203 +INFO: TimeDuration, Event = Pool_end, Time = 0.002731 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352755.496222 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352755.496319 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.496330 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.496353 +INFO: TimeDuration, Event = Add_end, Time = 0.000023 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352755.496363 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352755.496412 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000049 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.638865, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.539391 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.540312 +INFO: TimeDuration, Event = Add_end, Time = 0.000922 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352755.540629 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352755.541492 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000863 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352755.541506 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352755.545209 +INFO: TimeDuration, Event = Pool_end, Time = 0.003703 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.571374 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.572095 +INFO: TimeDuration, Event = Add_end, Time = 0.000721 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352755.572108 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352755.572756 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000648 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352755.572773 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352755.575963 +INFO: TimeDuration, Event = Pool_end, Time = 0.003190 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.593751 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.594217 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352755.594231 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352755.594568 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000336 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.607208 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.607516 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352755.607530 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352755.607756 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.621935 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.622240 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352755.622253 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352755.622483 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352755.622500 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352755.625230 +INFO: TimeDuration, Event = Pool_end, Time = 0.002729 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352755.625250 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352755.625346 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.625361 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.625382 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352755.625396 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352755.625446 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000051 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 81.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.572386, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.667520 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.668447 +INFO: TimeDuration, Event = Add_end, Time = 0.000927 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352755.668617 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352755.669481 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000864 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352755.669497 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352755.673294 +INFO: TimeDuration, Event = Pool_end, Time = 0.003797 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.699497 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.700214 +INFO: TimeDuration, Event = Add_end, Time = 0.000718 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352755.700229 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352755.700881 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000653 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352755.700894 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352755.704087 +INFO: TimeDuration, Event = Pool_end, Time = 0.003192 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.721865 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.722330 +INFO: TimeDuration, Event = Add_end, Time = 0.000465 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352755.722344 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352755.722679 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000335 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.735312 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.735619 +INFO: TimeDuration, Event = Add_end, Time = 0.000306 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352755.735631 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352755.735862 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.750042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.750346 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352755.750359 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352755.750588 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352755.750603 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352755.753334 +INFO: TimeDuration, Event = Pool_end, Time = 0.002731 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352755.753354 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352755.753451 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.753464 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.753485 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352755.753498 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352755.753549 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000050 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 79.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.544899, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.795452 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.796384 +INFO: TimeDuration, Event = Add_end, Time = 0.000931 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352755.796610 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352755.797476 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000865 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352755.797492 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352755.801223 +INFO: TimeDuration, Event = Pool_end, Time = 0.003731 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.827410 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.828130 +INFO: TimeDuration, Event = Add_end, Time = 0.000720 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352755.828143 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352755.828793 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000650 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352755.828806 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352755.831997 +INFO: TimeDuration, Event = Pool_end, Time = 0.003191 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.849770 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.850234 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352755.850247 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352755.850581 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000334 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.863235 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.863543 +INFO: TimeDuration, Event = Add_end, Time = 0.000307 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352755.863555 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352755.863788 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000233 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.877959 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.878263 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352755.878275 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352755.878506 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352755.878522 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352755.881254 +INFO: TimeDuration, Event = Pool_end, Time = 0.002733 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352755.881274 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352755.881370 +INFO: TimeDuration, Event = Mul_end, Time = 0.000096 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.881383 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.881404 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352755.881417 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352755.881466 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000050 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.313797, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.923658 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.924583 +INFO: TimeDuration, Event = Add_end, Time = 0.000925 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352755.924609 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352755.925472 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000863 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352755.925486 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352755.928629 +INFO: TimeDuration, Event = Pool_end, Time = 0.003143 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.955635 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.956352 +INFO: TimeDuration, Event = Add_end, Time = 0.000717 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352755.956363 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352755.957009 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000646 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352755.957021 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352755.960224 +INFO: TimeDuration, Event = Pool_end, Time = 0.003203 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.978006 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.978473 +INFO: TimeDuration, Event = Add_end, Time = 0.000467 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352755.978488 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352755.978822 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000334 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352755.991450 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352755.991757 +INFO: TimeDuration, Event = Add_end, Time = 0.000307 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352755.991770 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352755.991997 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.006181 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.006485 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352756.006497 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352756.006727 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352756.006743 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352756.009477 +INFO: TimeDuration, Event = Pool_end, Time = 0.002734 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352756.009497 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352756.009594 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.009607 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.009629 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352756.009643 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352756.009686 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 82.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.557515, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.052979 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.053921 +INFO: TimeDuration, Event = Add_end, Time = 0.000943 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352756.053938 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352756.054808 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000870 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352756.054821 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352756.057683 +INFO: TimeDuration, Event = Pool_end, Time = 0.002861 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.084948 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.085666 +INFO: TimeDuration, Event = Add_end, Time = 0.000718 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352756.085678 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352756.086327 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000649 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352756.086355 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352756.089549 +INFO: TimeDuration, Event = Pool_end, Time = 0.003194 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.107310 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.107774 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352756.107788 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352756.108121 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000334 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.120816 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.121126 +INFO: TimeDuration, Event = Add_end, Time = 0.000311 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352756.121140 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352756.121371 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.135570 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.135874 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352756.135887 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352756.136117 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352756.136133 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352756.138868 +INFO: TimeDuration, Event = Pool_end, Time = 0.002735 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352756.138885 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352756.138984 +INFO: TimeDuration, Event = Mul_end, Time = 0.000099 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.138996 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.139019 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352756.139032 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352756.139081 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000049 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 96.929582, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.181175 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.182097 +INFO: TimeDuration, Event = Add_end, Time = 0.000921 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352756.182113 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352756.182972 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000859 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352756.182986 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352756.185894 +INFO: TimeDuration, Event = Pool_end, Time = 0.002908 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.213139 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.213859 +INFO: TimeDuration, Event = Add_end, Time = 0.000720 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352756.213872 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352756.214520 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000648 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352756.214532 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352756.217726 +INFO: TimeDuration, Event = Pool_end, Time = 0.003195 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.235512 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.235976 +INFO: TimeDuration, Event = Add_end, Time = 0.000465 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352756.235989 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352756.236325 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000336 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.249017 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.249325 +INFO: TimeDuration, Event = Add_end, Time = 0.000309 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352756.249339 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352756.249567 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.263736 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.264042 +INFO: TimeDuration, Event = Add_end, Time = 0.000306 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352756.264055 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352756.264285 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352756.264299 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352756.267029 +INFO: TimeDuration, Event = Pool_end, Time = 0.002730 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352756.267049 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352756.267146 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.267159 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.267180 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352756.267194 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352756.267244 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000051 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 80.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.667309, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.309414 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.310334 +INFO: TimeDuration, Event = Add_end, Time = 0.000920 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352756.310348 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352756.311210 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000862 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352756.311223 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352756.314145 +INFO: TimeDuration, Event = Pool_end, Time = 0.002922 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.341435 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.342154 +INFO: TimeDuration, Event = Add_end, Time = 0.000719 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352756.342168 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352756.342814 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000646 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352756.342827 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352756.346034 +INFO: TimeDuration, Event = Pool_end, Time = 0.003207 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.363761 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.364244 +INFO: TimeDuration, Event = Add_end, Time = 0.000483 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352756.364258 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352756.364592 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000334 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.377259 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.377566 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352756.377578 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352756.377807 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.393049 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.393355 +INFO: TimeDuration, Event = Add_end, Time = 0.000306 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352756.393367 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352756.393599 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352756.393615 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352756.396343 +INFO: TimeDuration, Event = Pool_end, Time = 0.002729 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352756.396596 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352756.396694 +INFO: TimeDuration, Event = Mul_end, Time = 0.000098 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.396707 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.396728 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352756.396742 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352756.396791 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000049 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 79.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 96.802903, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.439533 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.440463 +INFO: TimeDuration, Event = Add_end, Time = 0.000930 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352756.440625 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352756.441503 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000877 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352756.441525 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352756.445312 +INFO: TimeDuration, Event = Pool_end, Time = 0.003787 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.471522 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.472241 +INFO: TimeDuration, Event = Add_end, Time = 0.000720 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352756.472255 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352756.472903 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000649 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352756.472916 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352756.476111 +INFO: TimeDuration, Event = Pool_end, Time = 0.003196 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.493881 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.494345 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352756.494358 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352756.494695 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000337 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.508859 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.509179 +INFO: TimeDuration, Event = Add_end, Time = 0.000319 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352756.509193 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352756.509425 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.523562 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.523864 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352756.523876 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352756.524107 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352756.524122 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352756.526865 +INFO: TimeDuration, Event = Pool_end, Time = 0.002744 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352756.526885 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352756.527000 +INFO: TimeDuration, Event = Mul_end, Time = 0.000115 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.527013 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.527039 +INFO: TimeDuration, Event = Add_end, Time = 0.000026 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352756.527052 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352756.527099 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000046 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 96.956891, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.569183 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.570115 +INFO: TimeDuration, Event = Add_end, Time = 0.000932 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352756.570129 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352756.570991 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000861 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352756.571005 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352756.573903 +INFO: TimeDuration, Event = Pool_end, Time = 0.002898 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.601158 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.601880 +INFO: TimeDuration, Event = Add_end, Time = 0.000722 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352756.601894 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352756.602542 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000648 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352756.602555 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352756.605753 +INFO: TimeDuration, Event = Pool_end, Time = 0.003198 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.623492 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.623955 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352756.623968 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352756.624302 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000334 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.637022 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.637331 +INFO: TimeDuration, Event = Add_end, Time = 0.000309 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352756.637343 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352756.637574 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.651747 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.652050 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352756.652062 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352756.652292 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352756.652315 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352756.655040 +INFO: TimeDuration, Event = Pool_end, Time = 0.002725 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352756.655058 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352756.655154 +INFO: TimeDuration, Event = Mul_end, Time = 0.000096 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.655165 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.655187 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352756.655200 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352756.655248 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000048 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 76.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.417775, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.697946 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.698866 +INFO: TimeDuration, Event = Add_end, Time = 0.000920 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352756.698880 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352756.699737 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000857 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352756.699750 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352756.702664 +INFO: TimeDuration, Event = Pool_end, Time = 0.002914 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.729941 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.730661 +INFO: TimeDuration, Event = Add_end, Time = 0.000720 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352756.730674 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352756.731322 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000648 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352756.731334 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352756.734531 +INFO: TimeDuration, Event = Pool_end, Time = 0.003198 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.752316 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.752781 +INFO: TimeDuration, Event = Add_end, Time = 0.000465 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352756.752796 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352756.753130 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000334 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.765795 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.766101 +INFO: TimeDuration, Event = Add_end, Time = 0.000306 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352756.766113 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352756.766342 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.780542 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.780844 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352756.780858 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352756.781089 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352756.781105 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352756.783834 +INFO: TimeDuration, Event = Pool_end, Time = 0.002729 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352756.783853 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352756.783947 +INFO: TimeDuration, Event = Mul_end, Time = 0.000095 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.783962 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.783983 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352756.783996 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352756.784049 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000053 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.744015, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.826546 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.827469 +INFO: TimeDuration, Event = Add_end, Time = 0.000923 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352756.827484 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352756.828348 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000864 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352756.828654 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352756.832630 +INFO: TimeDuration, Event = Pool_end, Time = 0.003976 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.858556 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.859274 +INFO: TimeDuration, Event = Add_end, Time = 0.000718 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352756.859288 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352756.859936 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000647 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352756.859948 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352756.863164 +INFO: TimeDuration, Event = Pool_end, Time = 0.003217 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.884234 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.884701 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352756.884717 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352756.885052 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000335 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.897904 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.898211 +INFO: TimeDuration, Event = Add_end, Time = 0.000307 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352756.898225 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352756.898453 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.913749 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.914054 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352756.914066 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352756.914296 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352756.914345 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352756.917099 +INFO: TimeDuration, Event = Pool_end, Time = 0.002755 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352756.917125 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352756.917230 +INFO: TimeDuration, Event = Mul_end, Time = 0.000106 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.917247 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.917270 +INFO: TimeDuration, Event = Add_end, Time = 0.000023 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352756.917286 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352756.917335 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000048 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 81.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 100.045589, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.960559 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.961482 +INFO: TimeDuration, Event = Add_end, Time = 0.000923 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352756.961505 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352756.962380 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000875 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352756.962399 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352756.966321 +INFO: TimeDuration, Event = Pool_end, Time = 0.003922 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352756.993171 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352756.993890 +INFO: TimeDuration, Event = Add_end, Time = 0.000719 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352756.993903 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352756.994553 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000650 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352756.994564 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352756.997758 +INFO: TimeDuration, Event = Pool_end, Time = 0.003194 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.015599 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.016062 +INFO: TimeDuration, Event = Add_end, Time = 0.000462 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352757.016074 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352757.016410 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000335 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.029710 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.030023 +INFO: TimeDuration, Event = Add_end, Time = 0.000313 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352757.030037 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352757.030266 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.044430 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.044735 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352757.044748 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352757.044980 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352757.044996 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352757.047721 +INFO: TimeDuration, Event = Pool_end, Time = 0.002726 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352757.047740 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352757.047838 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.047851 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.047872 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352757.047886 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352757.047928 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 79.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 97.002266, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.092293 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.093244 +INFO: TimeDuration, Event = Add_end, Time = 0.000951 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352757.093260 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352757.094125 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000865 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352757.094138 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352757.097016 +INFO: TimeDuration, Event = Pool_end, Time = 0.002878 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.124234 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.124956 +INFO: TimeDuration, Event = Add_end, Time = 0.000722 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352757.124972 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352757.125622 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000650 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352757.125634 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352757.128831 +INFO: TimeDuration, Event = Pool_end, Time = 0.003196 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.146605 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.147084 +INFO: TimeDuration, Event = Add_end, Time = 0.000479 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352757.147096 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352757.147429 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000334 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.160106 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.160415 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352757.160579 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352757.160806 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.174843 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.175148 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352757.175161 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352757.175401 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000240 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352757.175417 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352757.178136 +INFO: TimeDuration, Event = Pool_end, Time = 0.002719 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352757.178156 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352757.178260 +INFO: TimeDuration, Event = Mul_end, Time = 0.000104 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.178273 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.178297 +INFO: TimeDuration, Event = Add_end, Time = 0.000023 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352757.178310 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352757.178362 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000051 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.436146, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.220421 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.221344 +INFO: TimeDuration, Event = Add_end, Time = 0.000923 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352757.221360 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352757.222225 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000865 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352757.222239 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352757.225141 +INFO: TimeDuration, Event = Pool_end, Time = 0.002902 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.252421 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.253143 +INFO: TimeDuration, Event = Add_end, Time = 0.000721 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352757.253155 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352757.253799 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000644 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352757.253810 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352757.257011 +INFO: TimeDuration, Event = Pool_end, Time = 0.003200 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.274784 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.275249 +INFO: TimeDuration, Event = Add_end, Time = 0.000465 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352757.275262 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352757.275596 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000334 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.288263 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.288570 +INFO: TimeDuration, Event = Add_end, Time = 0.000307 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352757.288583 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352757.288811 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.303012 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.303314 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352757.303326 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352757.303555 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352757.303571 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352757.306302 +INFO: TimeDuration, Event = Pool_end, Time = 0.002731 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352757.306322 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352757.306418 +INFO: TimeDuration, Event = Mul_end, Time = 0.000096 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.306431 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.306453 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352757.306466 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352757.306516 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000050 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 82.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.622714, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.348541 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.349465 +INFO: TimeDuration, Event = Add_end, Time = 0.000924 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352757.349482 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352757.350345 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000864 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352757.350359 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352757.353261 +INFO: TimeDuration, Event = Pool_end, Time = 0.002902 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.380486 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.381210 +INFO: TimeDuration, Event = Add_end, Time = 0.000724 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352757.381223 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352757.381875 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000652 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352757.381887 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352757.385101 +INFO: TimeDuration, Event = Pool_end, Time = 0.003214 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.404758 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.405226 +INFO: TimeDuration, Event = Add_end, Time = 0.000468 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352757.405239 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352757.405575 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000335 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.418323 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.418632 +INFO: TimeDuration, Event = Add_end, Time = 0.000309 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352757.418644 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352757.418873 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.433074 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.433378 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352757.433392 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352757.433623 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352757.433640 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352757.436365 +INFO: TimeDuration, Event = Pool_end, Time = 0.002725 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352757.436617 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352757.436717 +INFO: TimeDuration, Event = Mul_end, Time = 0.000100 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.436732 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.436753 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352757.436767 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352757.436810 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 97.798136, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.481109 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.482038 +INFO: TimeDuration, Event = Add_end, Time = 0.000929 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352757.482061 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352757.482947 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000886 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352757.482968 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352757.486886 +INFO: TimeDuration, Event = Pool_end, Time = 0.003918 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.513193 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.513915 +INFO: TimeDuration, Event = Add_end, Time = 0.000721 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352757.513929 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352757.514580 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000651 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352757.514592 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352757.517770 +INFO: TimeDuration, Event = Pool_end, Time = 0.003177 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.538723 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.539191 +INFO: TimeDuration, Event = Add_end, Time = 0.000468 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352757.539206 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352757.539542 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000337 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.552263 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.552576 +INFO: TimeDuration, Event = Add_end, Time = 0.000313 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352757.552590 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352757.552820 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.566997 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.567300 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352757.567313 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352757.567542 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352757.567558 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352757.570292 +INFO: TimeDuration, Event = Pool_end, Time = 0.002734 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352757.570312 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352757.570409 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.570421 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.570445 +INFO: TimeDuration, Event = Add_end, Time = 0.000023 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352757.570458 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352757.570507 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000049 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 80.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 99.034892, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.612277 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.613226 +INFO: TimeDuration, Event = Add_end, Time = 0.000949 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352757.613242 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352757.614114 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000872 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352757.614126 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352757.616996 +INFO: TimeDuration, Event = Pool_end, Time = 0.002870 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.644283 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.645010 +INFO: TimeDuration, Event = Add_end, Time = 0.000726 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352757.645023 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352757.645670 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000647 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352757.645683 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352757.648875 +INFO: TimeDuration, Event = Pool_end, Time = 0.003191 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.666661 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.667124 +INFO: TimeDuration, Event = Add_end, Time = 0.000463 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352757.667137 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352757.667472 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000335 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.680148 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.680456 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352757.680466 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352757.680696 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.694877 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.695181 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352757.695192 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352757.695421 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352757.695436 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352757.698165 +INFO: TimeDuration, Event = Pool_end, Time = 0.002730 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352757.698186 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352757.698283 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.698314 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.698335 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352757.698349 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352757.698393 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 79.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.642825, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.740218 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.741135 +INFO: TimeDuration, Event = Add_end, Time = 0.000917 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352757.741152 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352757.742014 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000863 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352757.742028 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352757.744936 +INFO: TimeDuration, Event = Pool_end, Time = 0.002908 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.772207 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.772926 +INFO: TimeDuration, Event = Add_end, Time = 0.000719 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352757.772940 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352757.773588 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000648 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352757.773601 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352757.776796 +INFO: TimeDuration, Event = Pool_end, Time = 0.003196 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.794567 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.795034 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352757.795046 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352757.795380 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000334 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.808044 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.808352 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352757.808587 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352757.808814 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.822794 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.823099 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352757.823112 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352757.823342 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352757.823357 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352757.826084 +INFO: TimeDuration, Event = Pool_end, Time = 0.002727 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352757.826103 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352757.826201 +INFO: TimeDuration, Event = Mul_end, Time = 0.000098 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.826216 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.826237 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352757.826250 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352757.826300 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000051 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.374752, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.868160 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.869081 +INFO: TimeDuration, Event = Add_end, Time = 0.000921 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352757.869098 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352757.869966 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000868 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352757.869980 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352757.872892 +INFO: TimeDuration, Event = Pool_end, Time = 0.002912 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.900166 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.900883 +INFO: TimeDuration, Event = Add_end, Time = 0.000717 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352757.900898 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352757.901545 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000647 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352757.901557 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352757.904781 +INFO: TimeDuration, Event = Pool_end, Time = 0.003224 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.922559 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.923026 +INFO: TimeDuration, Event = Add_end, Time = 0.000467 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352757.923047 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352757.923388 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000341 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.936021 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.936332 +INFO: TimeDuration, Event = Add_end, Time = 0.000310 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352757.936342 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352757.936573 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.951798 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.952106 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352757.952119 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352757.952350 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352757.952617 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352757.955090 +INFO: TimeDuration, Event = Pool_end, Time = 0.002472 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352757.955109 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352757.955206 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352757.955220 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352757.955242 +INFO: TimeDuration, Event = Add_end, Time = 0.000023 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352757.955258 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352757.955300 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 76.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 96.403262, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.001786 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.002735 +INFO: TimeDuration, Event = Add_end, Time = 0.000949 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352758.002774 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352758.003668 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000894 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352758.003690 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352758.008584 +INFO: TimeDuration, Event = Pool_end, Time = 0.004894 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.033909 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.034634 +INFO: TimeDuration, Event = Add_end, Time = 0.000725 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352758.034647 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352758.035303 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000656 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352758.035315 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352758.038506 +INFO: TimeDuration, Event = Pool_end, Time = 0.003191 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.056487 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.056954 +INFO: TimeDuration, Event = Add_end, Time = 0.000467 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352758.056968 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352758.057302 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000334 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.070202 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.070514 +INFO: TimeDuration, Event = Add_end, Time = 0.000312 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352758.070528 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352758.070757 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.085045 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.085351 +INFO: TimeDuration, Event = Add_end, Time = 0.000306 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352758.085365 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352758.085594 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352758.085626 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352758.088326 +INFO: TimeDuration, Event = Pool_end, Time = 0.002700 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352758.088869 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352758.089015 +INFO: TimeDuration, Event = Mul_end, Time = 0.000146 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.089031 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.089059 +INFO: TimeDuration, Event = Add_end, Time = 0.000029 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352758.089074 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352758.089122 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000047 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 96.844215, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.134027 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.134948 +INFO: TimeDuration, Event = Add_end, Time = 0.000921 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352758.134975 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352758.135869 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000894 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352758.135882 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352758.138742 +INFO: TimeDuration, Event = Pool_end, Time = 0.002859 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.166016 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.166734 +INFO: TimeDuration, Event = Add_end, Time = 0.000718 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352758.166748 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352758.167393 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000646 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352758.167405 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352758.170607 +INFO: TimeDuration, Event = Pool_end, Time = 0.003202 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.188391 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.188854 +INFO: TimeDuration, Event = Add_end, Time = 0.000462 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352758.188881 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352758.189215 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000334 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.201859 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.202167 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352758.202179 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352758.202408 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.216931 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.217236 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352758.217248 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352758.217479 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352758.217495 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352758.220215 +INFO: TimeDuration, Event = Pool_end, Time = 0.002720 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352758.220234 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352758.220331 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.220598 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.220623 +INFO: TimeDuration, Event = Add_end, Time = 0.000025 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352758.220636 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352758.220680 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 81.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 96.683241, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.270430 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.271359 +INFO: TimeDuration, Event = Add_end, Time = 0.000929 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352758.271380 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352758.272256 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000876 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352758.272276 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352758.275153 +INFO: TimeDuration, Event = Pool_end, Time = 0.002877 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.302750 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.303473 +INFO: TimeDuration, Event = Add_end, Time = 0.000723 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352758.303492 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352758.304140 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000648 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352758.304157 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352758.308401 +INFO: TimeDuration, Event = Pool_end, Time = 0.004244 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.326365 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.326833 +INFO: TimeDuration, Event = Add_end, Time = 0.000468 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352758.326852 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352758.327189 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000337 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.340214 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.340527 +INFO: TimeDuration, Event = Add_end, Time = 0.000313 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352758.340547 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352758.340788 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000241 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.356525 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.356835 +INFO: TimeDuration, Event = Add_end, Time = 0.000310 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352758.356854 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352758.357088 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000234 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352758.357112 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352758.359822 +INFO: TimeDuration, Event = Pool_end, Time = 0.002710 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352758.359848 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352758.359961 +INFO: TimeDuration, Event = Mul_end, Time = 0.000112 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.359979 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.360006 +INFO: TimeDuration, Event = Add_end, Time = 0.000028 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352758.360026 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352758.360085 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000059 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 79.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 99.827574, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.408357 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.409283 +INFO: TimeDuration, Event = Add_end, Time = 0.000926 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352758.409299 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352758.410157 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000858 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352758.410170 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352758.413056 +INFO: TimeDuration, Event = Pool_end, Time = 0.002886 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.440321 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.441040 +INFO: TimeDuration, Event = Add_end, Time = 0.000719 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352758.441053 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352758.441700 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000647 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352758.441713 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352758.444908 +INFO: TimeDuration, Event = Pool_end, Time = 0.003195 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.463795 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.464267 +INFO: TimeDuration, Event = Add_end, Time = 0.000472 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352758.464283 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352758.464621 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000338 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.477281 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.477592 +INFO: TimeDuration, Event = Add_end, Time = 0.000311 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352758.477606 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352758.477837 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.493443 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.493750 +INFO: TimeDuration, Event = Add_end, Time = 0.000307 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352758.493763 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352758.493994 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352758.494010 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352758.496736 +INFO: TimeDuration, Event = Pool_end, Time = 0.002726 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352758.496755 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352758.496862 +INFO: TimeDuration, Event = Mul_end, Time = 0.000107 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.496876 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.496897 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352758.496910 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352758.496959 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000049 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 98.179712, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.543860 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.544787 +INFO: TimeDuration, Event = Add_end, Time = 0.000926 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352758.544804 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352758.545668 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000864 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352758.545681 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352758.548577 +INFO: TimeDuration, Event = Pool_end, Time = 0.002896 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.575819 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.576539 +INFO: TimeDuration, Event = Add_end, Time = 0.000720 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352758.576549 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352758.577195 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000646 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352758.577211 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352758.580410 +INFO: TimeDuration, Event = Pool_end, Time = 0.003200 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.598187 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.598652 +INFO: TimeDuration, Event = Add_end, Time = 0.000465 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352758.598665 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352758.599001 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000335 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.611633 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.611940 +INFO: TimeDuration, Event = Add_end, Time = 0.000307 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352758.611952 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352758.612184 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.626363 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.626666 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352758.626679 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352758.626910 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352758.626930 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352758.629660 +INFO: TimeDuration, Event = Pool_end, Time = 0.002731 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352758.629680 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352758.629777 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.629791 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.629812 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352758.629826 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352758.629872 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000046 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 82.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.497216, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.671970 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.672894 +INFO: TimeDuration, Event = Add_end, Time = 0.000925 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352758.672912 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352758.673778 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000866 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352758.673792 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352758.676692 +INFO: TimeDuration, Event = Pool_end, Time = 0.002900 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.703933 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.704653 +INFO: TimeDuration, Event = Add_end, Time = 0.000720 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352758.704667 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352758.705316 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000648 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352758.705329 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352758.708524 +INFO: TimeDuration, Event = Pool_end, Time = 0.003195 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.726305 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.726769 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352758.726781 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352758.727115 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000333 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.739763 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.740070 +INFO: TimeDuration, Event = Add_end, Time = 0.000307 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352758.740081 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352758.740328 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000247 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.754493 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.754796 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352758.754809 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352758.755040 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352758.755057 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352758.757787 +INFO: TimeDuration, Event = Pool_end, Time = 0.002729 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352758.757807 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352758.757904 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.757918 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.757939 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352758.757952 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352758.758002 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000050 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.552111, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.800342 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.801262 +INFO: TimeDuration, Event = Add_end, Time = 0.000920 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352758.801277 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352758.802138 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000860 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352758.802153 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352758.805057 +INFO: TimeDuration, Event = Pool_end, Time = 0.002904 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.832298 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.833026 +INFO: TimeDuration, Event = Add_end, Time = 0.000728 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352758.833041 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352758.833688 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000647 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352758.833701 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352758.836897 +INFO: TimeDuration, Event = Pool_end, Time = 0.003195 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.854673 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.855138 +INFO: TimeDuration, Event = Add_end, Time = 0.000465 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352758.855151 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352758.855483 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000332 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.868164 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.868471 +INFO: TimeDuration, Event = Add_end, Time = 0.000307 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352758.868583 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352758.868809 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.882900 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.883205 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352758.883217 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352758.883446 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352758.883462 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352758.886198 +INFO: TimeDuration, Event = Pool_end, Time = 0.002736 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352758.886218 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352758.886320 +INFO: TimeDuration, Event = Mul_end, Time = 0.000102 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.886333 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.886355 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352758.886368 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352758.886418 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000051 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 80.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.516562, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.928252 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.929176 +INFO: TimeDuration, Event = Add_end, Time = 0.000924 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352758.929191 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352758.930056 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000865 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352758.930070 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352758.932973 +INFO: TimeDuration, Event = Pool_end, Time = 0.002903 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.960398 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.961126 +INFO: TimeDuration, Event = Add_end, Time = 0.000728 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352758.961141 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352758.961790 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000649 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352758.961802 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352758.965109 +INFO: TimeDuration, Event = Pool_end, Time = 0.003307 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.982725 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.983192 +INFO: TimeDuration, Event = Add_end, Time = 0.000467 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352758.983205 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352758.983541 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000335 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352758.996250 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352758.996558 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352758.996593 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352758.996821 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.010957 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.011262 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352759.011274 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352759.011504 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352759.011521 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352759.014258 +INFO: TimeDuration, Event = Pool_end, Time = 0.002737 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352759.014278 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352759.014375 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.014388 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.014410 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352759.014423 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352759.014466 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 79.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.708803, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.056509 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.057436 +INFO: TimeDuration, Event = Add_end, Time = 0.000927 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352759.057467 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352759.058329 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000862 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352759.058344 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352759.061224 +INFO: TimeDuration, Event = Pool_end, Time = 0.002880 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.088534 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.089255 +INFO: TimeDuration, Event = Add_end, Time = 0.000721 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352759.089268 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352759.089918 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000649 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352759.089930 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352759.093120 +INFO: TimeDuration, Event = Pool_end, Time = 0.003190 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.110885 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.111348 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352759.111361 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352759.111697 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000336 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.124379 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.124687 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352759.124701 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352759.124931 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.139108 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.139411 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352759.139423 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352759.139655 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352759.139671 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352759.142408 +INFO: TimeDuration, Event = Pool_end, Time = 0.002737 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352759.142428 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352759.142527 +INFO: TimeDuration, Event = Mul_end, Time = 0.000099 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.142539 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.142560 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352759.142574 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352759.142617 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.650631, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.184643 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.185564 +INFO: TimeDuration, Event = Add_end, Time = 0.000921 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352759.185580 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352759.186438 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000858 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352759.186453 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352759.189360 +INFO: TimeDuration, Event = Pool_end, Time = 0.002907 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.216627 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.217345 +INFO: TimeDuration, Event = Add_end, Time = 0.000718 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352759.217358 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352759.218005 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000647 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352759.218016 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352759.221217 +INFO: TimeDuration, Event = Pool_end, Time = 0.003201 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.239015 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.239494 +INFO: TimeDuration, Event = Add_end, Time = 0.000479 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352759.239507 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352759.239841 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000334 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.252523 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.252829 +INFO: TimeDuration, Event = Add_end, Time = 0.000306 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352759.252841 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352759.253069 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.267229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.267530 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352759.267544 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352759.267773 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352759.267790 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352759.270530 +INFO: TimeDuration, Event = Pool_end, Time = 0.002740 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352759.270551 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352759.270648 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.270661 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.270682 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352759.270695 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352759.270738 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 76.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.655380, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.313535 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.314463 +INFO: TimeDuration, Event = Add_end, Time = 0.000928 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352759.314479 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352759.315347 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000868 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352759.315361 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352759.318253 +INFO: TimeDuration, Event = Pool_end, Time = 0.002891 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.345483 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.346202 +INFO: TimeDuration, Event = Add_end, Time = 0.000719 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352759.346214 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352759.346863 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000648 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352759.346875 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352759.350076 +INFO: TimeDuration, Event = Pool_end, Time = 0.003201 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.367863 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.368326 +INFO: TimeDuration, Event = Add_end, Time = 0.000463 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352759.368336 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352759.368670 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000334 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.381367 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.381676 +INFO: TimeDuration, Event = Add_end, Time = 0.000309 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352759.381694 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352759.381924 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.396074 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.396658 +INFO: TimeDuration, Event = Add_end, Time = 0.000584 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352759.396676 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352759.396906 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352759.396923 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352759.399368 +INFO: TimeDuration, Event = Pool_end, Time = 0.002445 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352759.399387 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352759.399485 +INFO: TimeDuration, Event = Mul_end, Time = 0.000098 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.399498 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.399519 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352759.399533 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352759.399582 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000050 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.624122, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.442100 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.443021 +INFO: TimeDuration, Event = Add_end, Time = 0.000921 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352759.443037 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352759.443901 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000864 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352759.443914 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352759.446826 +INFO: TimeDuration, Event = Pool_end, Time = 0.002911 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.474059 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.474778 +INFO: TimeDuration, Event = Add_end, Time = 0.000719 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352759.474792 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352759.475440 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000648 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352759.475452 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352759.478650 +INFO: TimeDuration, Event = Pool_end, Time = 0.003198 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.496423 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.496887 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352759.496900 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352759.497234 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000333 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.509940 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.510248 +INFO: TimeDuration, Event = Add_end, Time = 0.000309 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352759.510261 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352759.510500 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000239 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.524690 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.524993 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352759.525006 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352759.525235 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352759.525251 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352759.527980 +INFO: TimeDuration, Event = Pool_end, Time = 0.002729 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352759.527998 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352759.528093 +INFO: TimeDuration, Event = Mul_end, Time = 0.000095 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.528106 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.528128 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352759.528141 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352759.528190 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000049 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 81.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.633442, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.570802 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.571727 +INFO: TimeDuration, Event = Add_end, Time = 0.000925 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352759.571743 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352759.572604 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000861 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352759.572628 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352759.575521 +INFO: TimeDuration, Event = Pool_end, Time = 0.002893 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.602782 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.603501 +INFO: TimeDuration, Event = Add_end, Time = 0.000719 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352759.603515 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352759.604166 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000651 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352759.604178 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352759.607379 +INFO: TimeDuration, Event = Pool_end, Time = 0.003201 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.625171 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.625637 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352759.625658 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352759.626000 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000342 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.638644 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.638953 +INFO: TimeDuration, Event = Add_end, Time = 0.000309 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352759.638966 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352759.639195 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.654444 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.654750 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352759.654763 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352759.654995 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352759.655011 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352759.657735 +INFO: TimeDuration, Event = Pool_end, Time = 0.002724 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352759.657755 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352759.657853 +INFO: TimeDuration, Event = Mul_end, Time = 0.000098 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.657867 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.657888 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352759.657902 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352759.657946 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 79.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 96.670346, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.701064 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.701984 +INFO: TimeDuration, Event = Add_end, Time = 0.000920 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352759.702001 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352759.702864 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000864 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352759.702878 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352759.705818 +INFO: TimeDuration, Event = Pool_end, Time = 0.002941 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.733020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.733742 +INFO: TimeDuration, Event = Add_end, Time = 0.000722 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352759.733757 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352759.734403 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000646 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352759.734414 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352759.737612 +INFO: TimeDuration, Event = Pool_end, Time = 0.003197 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.755384 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.755849 +INFO: TimeDuration, Event = Add_end, Time = 0.000465 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352759.755862 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352759.756197 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000335 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.768889 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.769195 +INFO: TimeDuration, Event = Add_end, Time = 0.000306 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352759.769208 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352759.769436 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.783589 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.783892 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352759.783906 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352759.784136 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352759.784153 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352759.786884 +INFO: TimeDuration, Event = Pool_end, Time = 0.002731 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352759.786904 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352759.787002 +INFO: TimeDuration, Event = Mul_end, Time = 0.000098 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.787015 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.787036 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352759.787050 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352759.787098 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000049 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.650452, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.829393 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.830316 +INFO: TimeDuration, Event = Add_end, Time = 0.000922 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352759.830329 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352759.831194 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000864 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352759.831208 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352759.834110 +INFO: TimeDuration, Event = Pool_end, Time = 0.002901 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.861353 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.862070 +INFO: TimeDuration, Event = Add_end, Time = 0.000718 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352759.862092 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352759.862741 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000649 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352759.862752 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352759.865950 +INFO: TimeDuration, Event = Pool_end, Time = 0.003198 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.883743 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.884208 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352759.884222 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352759.884556 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000334 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.897253 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.897563 +INFO: TimeDuration, Event = Add_end, Time = 0.000309 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352759.897575 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352759.897807 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.911957 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.912259 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352759.912272 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352759.912500 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352759.912592 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352759.915252 +INFO: TimeDuration, Event = Pool_end, Time = 0.002660 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352759.915291 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352759.915388 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.915402 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.915423 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352759.915437 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352759.915482 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000045 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 82.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.731406, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.957877 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.958799 +INFO: TimeDuration, Event = Add_end, Time = 0.000922 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352759.958814 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352759.959678 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000864 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352759.959692 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352759.962598 +INFO: TimeDuration, Event = Pool_end, Time = 0.002905 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352759.989875 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352759.990595 +INFO: TimeDuration, Event = Add_end, Time = 0.000720 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352759.990607 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352759.991253 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000646 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352759.991266 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352759.994465 +INFO: TimeDuration, Event = Pool_end, Time = 0.003199 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.012212 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.012676 +INFO: TimeDuration, Event = Add_end, Time = 0.000465 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352760.012689 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352760.013023 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000334 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.025714 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.026023 +INFO: TimeDuration, Event = Add_end, Time = 0.000309 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352760.026035 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352760.026264 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.040480 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.040784 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352760.040797 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352760.041028 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352760.041045 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352760.043769 +INFO: TimeDuration, Event = Pool_end, Time = 0.002724 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352760.043788 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352760.043884 +INFO: TimeDuration, Event = Mul_end, Time = 0.000096 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.043896 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.043918 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352760.043932 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352760.043974 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000042 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.760158, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.086103 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.087022 +INFO: TimeDuration, Event = Add_end, Time = 0.000919 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352760.087037 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352760.087901 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000865 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352760.087915 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352760.090826 +INFO: TimeDuration, Event = Pool_end, Time = 0.002910 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.118069 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.118788 +INFO: TimeDuration, Event = Add_end, Time = 0.000718 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352760.118801 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352760.119448 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000647 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352760.119459 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352760.122666 +INFO: TimeDuration, Event = Pool_end, Time = 0.003206 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.140444 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.140907 +INFO: TimeDuration, Event = Add_end, Time = 0.000463 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352760.140919 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352760.141254 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000334 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.153946 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.154253 +INFO: TimeDuration, Event = Add_end, Time = 0.000307 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352760.154265 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352760.154494 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.168688 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.168990 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352760.169002 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352760.169230 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352760.169246 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352760.171978 +INFO: TimeDuration, Event = Pool_end, Time = 0.002732 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352760.171996 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352760.172091 +INFO: TimeDuration, Event = Mul_end, Time = 0.000095 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.172105 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.172126 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352760.172139 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352760.172181 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000042 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 80.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.612675, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.214361 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.215290 +INFO: TimeDuration, Event = Add_end, Time = 0.000929 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352760.215305 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352760.216170 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000866 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352760.216184 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352760.219080 +INFO: TimeDuration, Event = Pool_end, Time = 0.002896 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.246349 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.247072 +INFO: TimeDuration, Event = Add_end, Time = 0.000723 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352760.247092 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352760.247749 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000657 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352760.247768 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352760.250938 +INFO: TimeDuration, Event = Pool_end, Time = 0.003170 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.268710 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.269174 +INFO: TimeDuration, Event = Add_end, Time = 0.000463 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352760.269187 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352760.269519 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000331 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.282214 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.282524 +INFO: TimeDuration, Event = Add_end, Time = 0.000310 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352760.282535 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352760.282765 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.296950 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.297254 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352760.297265 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352760.297494 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352760.297510 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352760.300241 +INFO: TimeDuration, Event = Pool_end, Time = 0.002732 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352760.300262 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352760.300358 +INFO: TimeDuration, Event = Mul_end, Time = 0.000096 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.300368 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.300389 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352760.300399 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352760.300442 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000042 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 79.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.605327, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.343389 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.344332 +INFO: TimeDuration, Event = Add_end, Time = 0.000943 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352760.344664 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352760.345533 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000869 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352760.345547 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352760.349289 +INFO: TimeDuration, Event = Pool_end, Time = 0.003742 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.375353 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.376078 +INFO: TimeDuration, Event = Add_end, Time = 0.000725 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352760.376091 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352760.376740 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000649 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352760.376754 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352760.379956 +INFO: TimeDuration, Event = Pool_end, Time = 0.003202 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.397855 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.398321 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352760.398333 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352760.398668 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000335 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.411405 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.411712 +INFO: TimeDuration, Event = Add_end, Time = 0.000307 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352760.411724 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352760.411955 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.426169 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.426474 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352760.426487 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352760.426717 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352760.426734 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352760.429471 +INFO: TimeDuration, Event = Pool_end, Time = 0.002737 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352760.429492 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352760.429593 +INFO: TimeDuration, Event = Mul_end, Time = 0.000101 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.429607 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.429631 +INFO: TimeDuration, Event = Add_end, Time = 0.000024 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352760.429645 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352760.429690 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000045 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.964809, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.471691 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.472638 +INFO: TimeDuration, Event = Add_end, Time = 0.000947 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352760.472655 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352760.473520 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000865 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352760.473534 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352760.476413 +INFO: TimeDuration, Event = Pool_end, Time = 0.002879 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.503655 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.504374 +INFO: TimeDuration, Event = Add_end, Time = 0.000719 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352760.504591 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352760.505241 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000650 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352760.505253 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352760.508262 +INFO: TimeDuration, Event = Pool_end, Time = 0.003009 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.526026 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.526495 +INFO: TimeDuration, Event = Add_end, Time = 0.000468 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352760.526516 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352760.526861 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000344 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.539466 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.539774 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352760.539788 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352760.540018 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.554196 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.554500 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352760.554513 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352760.554743 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352760.554758 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352760.557497 +INFO: TimeDuration, Event = Pool_end, Time = 0.002738 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352760.557518 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352760.557618 +INFO: TimeDuration, Event = Mul_end, Time = 0.000100 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.557633 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.557655 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352760.557668 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352760.557729 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000061 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 76.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.478130, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.611538 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.612464 +INFO: TimeDuration, Event = Add_end, Time = 0.000925 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352760.612477 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352760.613342 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000865 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352760.613356 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352760.616262 +INFO: TimeDuration, Event = Pool_end, Time = 0.002906 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.643477 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.644198 +INFO: TimeDuration, Event = Add_end, Time = 0.000721 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352760.644211 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352760.644861 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000650 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352760.644873 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352760.648069 +INFO: TimeDuration, Event = Pool_end, Time = 0.003195 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.665827 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.666291 +INFO: TimeDuration, Event = Add_end, Time = 0.000463 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352760.666303 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352760.666638 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000335 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.679269 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.679577 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352760.679589 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352760.679819 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.693983 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.694288 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352760.694301 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352760.694530 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352760.694546 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352760.697275 +INFO: TimeDuration, Event = Pool_end, Time = 0.002730 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352760.697294 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352760.697391 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.697405 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.697426 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352760.697440 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352760.697483 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.492416, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.740007 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.740931 +INFO: TimeDuration, Event = Add_end, Time = 0.000924 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352760.740947 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352760.741813 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000867 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352760.741829 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352760.744728 +INFO: TimeDuration, Event = Pool_end, Time = 0.002899 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.771864 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.772583 +INFO: TimeDuration, Event = Add_end, Time = 0.000719 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352760.772608 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352760.773256 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000648 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352760.773268 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352760.776656 +INFO: TimeDuration, Event = Pool_end, Time = 0.003388 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.794227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.794692 +INFO: TimeDuration, Event = Add_end, Time = 0.000465 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352760.794704 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352760.795038 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000334 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.807628 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.807936 +INFO: TimeDuration, Event = Add_end, Time = 0.000309 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352760.807948 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352760.808176 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.822348 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.822653 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352760.822665 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352760.822895 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352760.822911 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352760.825643 +INFO: TimeDuration, Event = Pool_end, Time = 0.002732 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352760.825663 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352760.825762 +INFO: TimeDuration, Event = Mul_end, Time = 0.000099 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.825775 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.825797 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352760.825810 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352760.825854 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 81.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.341568, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.867975 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.868898 +INFO: TimeDuration, Event = Add_end, Time = 0.000922 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352760.868915 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352760.869778 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000863 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352760.869792 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352760.872696 +INFO: TimeDuration, Event = Pool_end, Time = 0.002904 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.899814 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.900536 +INFO: TimeDuration, Event = Add_end, Time = 0.000721 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352760.900546 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352760.901195 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000649 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352760.901206 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352760.904403 +INFO: TimeDuration, Event = Pool_end, Time = 0.003197 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.922185 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.922649 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352760.922662 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352760.922997 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000335 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.935595 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.935901 +INFO: TimeDuration, Event = Add_end, Time = 0.000306 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352760.935915 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352760.936143 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.950314 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.950620 +INFO: TimeDuration, Event = Add_end, Time = 0.000306 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352760.950632 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352760.950864 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352760.950880 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352760.953606 +INFO: TimeDuration, Event = Pool_end, Time = 0.002727 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352760.953626 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352760.953723 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.953737 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.953758 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352760.953771 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352760.953813 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000042 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 79.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.333172, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352760.996164 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352760.997090 +INFO: TimeDuration, Event = Add_end, Time = 0.000926 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352760.997105 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352760.997971 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000866 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352760.997986 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352761.000902 +INFO: TimeDuration, Event = Pool_end, Time = 0.002917 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.028030 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.028750 +INFO: TimeDuration, Event = Add_end, Time = 0.000720 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352761.028764 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352761.029415 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000650 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352761.029427 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352761.032646 +INFO: TimeDuration, Event = Pool_end, Time = 0.003219 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.052561 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.053027 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352761.053041 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352761.053375 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000334 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.066139 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.066450 +INFO: TimeDuration, Event = Add_end, Time = 0.000311 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352761.066462 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352761.066705 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000243 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.081946 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.082252 +INFO: TimeDuration, Event = Add_end, Time = 0.000306 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352761.082264 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352761.082493 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352761.082509 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352761.085244 +INFO: TimeDuration, Event = Pool_end, Time = 0.002735 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352761.085264 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352761.085362 +INFO: TimeDuration, Event = Mul_end, Time = 0.000098 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.085375 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.085397 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352761.085410 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352761.085452 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000042 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 98.885223, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.129069 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.130013 +INFO: TimeDuration, Event = Add_end, Time = 0.000943 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352761.130038 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352761.130914 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000877 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352761.130935 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352761.134822 +INFO: TimeDuration, Event = Pool_end, Time = 0.003887 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.161210 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.161953 +INFO: TimeDuration, Event = Add_end, Time = 0.000743 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352761.161968 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352761.162615 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000647 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352761.162627 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352761.165798 +INFO: TimeDuration, Event = Pool_end, Time = 0.003171 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.183598 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.184065 +INFO: TimeDuration, Event = Add_end, Time = 0.000467 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352761.184078 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352761.184414 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000337 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.197187 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.197497 +INFO: TimeDuration, Event = Add_end, Time = 0.000310 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352761.197510 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352761.197739 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.213855 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.214162 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352761.214176 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352761.214407 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352761.214424 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352761.217135 +INFO: TimeDuration, Event = Pool_end, Time = 0.002711 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352761.217155 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352761.217254 +INFO: TimeDuration, Event = Mul_end, Time = 0.000098 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.217267 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.217288 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352761.217302 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352761.217345 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 82.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 97.779466, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.259639 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.260561 +INFO: TimeDuration, Event = Add_end, Time = 0.000923 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352761.260574 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352761.261440 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000865 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352761.261455 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352761.264356 +INFO: TimeDuration, Event = Pool_end, Time = 0.002900 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.291589 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.292318 +INFO: TimeDuration, Event = Add_end, Time = 0.000729 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352761.292336 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352761.292997 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000661 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352761.293018 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352761.296164 +INFO: TimeDuration, Event = Pool_end, Time = 0.003146 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.313928 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.314394 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352761.314407 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352761.314743 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000336 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.327338 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.327646 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352761.327658 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352761.327889 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.342063 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.342369 +INFO: TimeDuration, Event = Add_end, Time = 0.000306 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352761.342382 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352761.342615 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000233 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352761.342632 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352761.345355 +INFO: TimeDuration, Event = Pool_end, Time = 0.002722 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352761.345374 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352761.345471 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.345484 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.345505 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352761.345518 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352761.345563 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000045 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.431170, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.389037 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.389957 +INFO: TimeDuration, Event = Add_end, Time = 0.000921 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352761.389972 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352761.390838 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000866 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352761.390851 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352761.393766 +INFO: TimeDuration, Event = Pool_end, Time = 0.002915 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.420887 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.421608 +INFO: TimeDuration, Event = Add_end, Time = 0.000720 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352761.421622 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352761.422271 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000649 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352761.422282 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352761.425480 +INFO: TimeDuration, Event = Pool_end, Time = 0.003198 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.443255 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.443720 +INFO: TimeDuration, Event = Add_end, Time = 0.000465 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352761.443732 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352761.444069 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000337 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.456730 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.457038 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352761.457051 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352761.457280 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.471451 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.471754 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352761.471768 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352761.471998 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352761.472014 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352761.474745 +INFO: TimeDuration, Event = Pool_end, Time = 0.002731 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352761.474764 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352761.474860 +INFO: TimeDuration, Event = Mul_end, Time = 0.000096 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.474873 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.474894 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352761.474907 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352761.474956 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000049 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 80.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.794673, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.517101 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.518026 +INFO: TimeDuration, Event = Add_end, Time = 0.000926 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352761.518040 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352761.518902 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000862 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352761.518916 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352761.521818 +INFO: TimeDuration, Event = Pool_end, Time = 0.002902 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.548998 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.549721 +INFO: TimeDuration, Event = Add_end, Time = 0.000723 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352761.549734 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352761.550384 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000650 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352761.550396 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352761.553612 +INFO: TimeDuration, Event = Pool_end, Time = 0.003217 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.574765 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.575248 +INFO: TimeDuration, Event = Add_end, Time = 0.000484 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352761.575260 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352761.575592 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000332 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.588361 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.588669 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352761.588683 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352761.588913 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.604190 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.604496 +INFO: TimeDuration, Event = Add_end, Time = 0.000306 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352761.604584 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352761.604811 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352761.604830 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352761.607482 +INFO: TimeDuration, Event = Pool_end, Time = 0.002652 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352761.607501 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352761.607598 +INFO: TimeDuration, Event = Mul_end, Time = 0.000096 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.607610 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.607632 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352761.607644 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352761.607694 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000049 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 79.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 100.128707, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.651533 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.652455 +INFO: TimeDuration, Event = Add_end, Time = 0.000922 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352761.652477 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352761.653349 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000872 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352761.653373 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352761.657295 +INFO: TimeDuration, Event = Pool_end, Time = 0.003923 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.683703 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.684425 +INFO: TimeDuration, Event = Add_end, Time = 0.000722 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352761.684596 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352761.685245 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000648 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352761.685258 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352761.688293 +INFO: TimeDuration, Event = Pool_end, Time = 0.003036 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.706096 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.706561 +INFO: TimeDuration, Event = Add_end, Time = 0.000465 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352761.706596 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352761.706929 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000333 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.723491 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.723804 +INFO: TimeDuration, Event = Add_end, Time = 0.000313 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352761.723819 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352761.724052 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.738210 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.738512 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352761.738526 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352761.738753 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000227 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352761.738770 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352761.741506 +INFO: TimeDuration, Event = Pool_end, Time = 0.002736 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352761.741525 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352761.741624 +INFO: TimeDuration, Event = Mul_end, Time = 0.000098 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.741638 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.741659 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352761.741673 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352761.741716 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 99.358071, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.783687 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.784615 +INFO: TimeDuration, Event = Add_end, Time = 0.000928 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352761.784641 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352761.785508 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000867 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352761.785522 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352761.788667 +INFO: TimeDuration, Event = Pool_end, Time = 0.003145 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.815617 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.816337 +INFO: TimeDuration, Event = Add_end, Time = 0.000721 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352761.816588 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352761.817236 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000648 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352761.817248 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352761.820205 +INFO: TimeDuration, Event = Pool_end, Time = 0.002957 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.837987 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.838451 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352761.838464 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352761.838800 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000335 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.851445 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.851752 +INFO: TimeDuration, Event = Add_end, Time = 0.000307 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352761.851765 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352761.851996 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.866163 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.866467 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352761.866480 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352761.866709 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352761.866724 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352761.869456 +INFO: TimeDuration, Event = Pool_end, Time = 0.002732 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352761.869476 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352761.869573 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.869585 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.869606 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352761.869620 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352761.869670 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000050 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 76.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.281306, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.912024 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.912948 +INFO: TimeDuration, Event = Add_end, Time = 0.000925 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352761.912967 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352761.913834 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000867 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352761.913847 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352761.916747 +INFO: TimeDuration, Event = Pool_end, Time = 0.002899 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.943933 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.944654 +INFO: TimeDuration, Event = Add_end, Time = 0.000721 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352761.944668 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352761.945318 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000650 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352761.945329 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352761.948522 +INFO: TimeDuration, Event = Pool_end, Time = 0.003193 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.966320 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.966786 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352761.966799 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352761.967133 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000334 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.979778 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.980102 +INFO: TimeDuration, Event = Add_end, Time = 0.000324 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352761.980115 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352761.980352 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000237 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.994509 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.994842 +INFO: TimeDuration, Event = Add_end, Time = 0.000334 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352761.994856 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352761.995100 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000245 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352761.995120 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352761.997799 +INFO: TimeDuration, Event = Pool_end, Time = 0.002679 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352761.997819 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352761.997916 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352761.997930 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352761.997952 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352761.997965 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352761.998008 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.265552, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.040566 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.041491 +INFO: TimeDuration, Event = Add_end, Time = 0.000925 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352762.041507 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352762.042376 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000868 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352762.042389 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352762.045285 +INFO: TimeDuration, Event = Pool_end, Time = 0.002896 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.072448 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.073170 +INFO: TimeDuration, Event = Add_end, Time = 0.000723 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352762.073184 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352762.073846 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000662 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352762.073858 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352762.077062 +INFO: TimeDuration, Event = Pool_end, Time = 0.003205 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.098408 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.098883 +INFO: TimeDuration, Event = Add_end, Time = 0.000474 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352762.098895 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352762.099238 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000343 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.111986 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.112298 +INFO: TimeDuration, Event = Add_end, Time = 0.000312 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352762.112625 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352762.112861 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000235 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.127795 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.128100 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352762.128113 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352762.128345 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352762.128579 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352762.131093 +INFO: TimeDuration, Event = Pool_end, Time = 0.002513 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352762.131112 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352762.131216 +INFO: TimeDuration, Event = Mul_end, Time = 0.000104 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.131229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.131252 +INFO: TimeDuration, Event = Add_end, Time = 0.000023 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352762.131265 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352762.131313 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000048 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 81.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 99.779873, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.174283 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.175201 +INFO: TimeDuration, Event = Add_end, Time = 0.000918 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352762.175225 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352762.176100 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000875 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352762.176121 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352762.180042 +INFO: TimeDuration, Event = Pool_end, Time = 0.003921 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.206442 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.207161 +INFO: TimeDuration, Event = Add_end, Time = 0.000719 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352762.207174 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352762.207824 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000649 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352762.207837 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352762.211036 +INFO: TimeDuration, Event = Pool_end, Time = 0.003199 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.230915 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.231381 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352762.231395 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352762.231728 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000333 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.244425 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.244733 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352762.244745 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352762.244974 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.259129 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.259432 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352762.259467 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352762.259697 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352762.259713 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352762.262424 +INFO: TimeDuration, Event = Pool_end, Time = 0.002711 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352762.262444 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352762.262541 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.262554 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.262575 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352762.262589 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352762.262632 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 79.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 97.970744, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.305095 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.306010 +INFO: TimeDuration, Event = Add_end, Time = 0.000915 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352762.306026 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352762.306892 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000866 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352762.306905 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352762.309812 +INFO: TimeDuration, Event = Pool_end, Time = 0.002907 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.337020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.337739 +INFO: TimeDuration, Event = Add_end, Time = 0.000718 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352762.337752 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352762.338400 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000648 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352762.338410 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352762.341615 +INFO: TimeDuration, Event = Pool_end, Time = 0.003205 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.359393 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.359859 +INFO: TimeDuration, Event = Add_end, Time = 0.000465 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352762.359871 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352762.360207 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000336 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.372837 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.373144 +INFO: TimeDuration, Event = Add_end, Time = 0.000307 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352762.373158 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352762.373393 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000235 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.388769 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.389085 +INFO: TimeDuration, Event = Add_end, Time = 0.000316 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352762.389098 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352762.389329 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352762.389346 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352762.392065 +INFO: TimeDuration, Event = Pool_end, Time = 0.002719 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352762.392085 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352762.392202 +INFO: TimeDuration, Event = Mul_end, Time = 0.000118 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.392215 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.392240 +INFO: TimeDuration, Event = Add_end, Time = 0.000025 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352762.392254 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352762.392302 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000048 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 96.821664, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.434679 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.435610 +INFO: TimeDuration, Event = Add_end, Time = 0.000931 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352762.435626 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352762.436492 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000866 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352762.436505 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352762.439395 +INFO: TimeDuration, Event = Pool_end, Time = 0.002891 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.466550 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.467270 +INFO: TimeDuration, Event = Add_end, Time = 0.000720 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352762.467283 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352762.467935 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000652 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352762.467947 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352762.471142 +INFO: TimeDuration, Event = Pool_end, Time = 0.003194 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.488933 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.489398 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352762.489411 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352762.489742 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000331 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.502371 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.502681 +INFO: TimeDuration, Event = Add_end, Time = 0.000310 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352762.502694 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352762.502922 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.517087 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.517391 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352762.517404 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352762.517634 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352762.517651 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352762.520378 +INFO: TimeDuration, Event = Pool_end, Time = 0.002728 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352762.520394 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352762.520680 +INFO: TimeDuration, Event = Mul_end, Time = 0.000286 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.520695 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.520717 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352762.520730 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352762.520794 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000064 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 82.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.622661, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.563086 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.564009 +INFO: TimeDuration, Event = Add_end, Time = 0.000923 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352762.564023 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352762.564890 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000867 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352762.564908 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352762.567803 +INFO: TimeDuration, Event = Pool_end, Time = 0.002895 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.594965 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.595688 +INFO: TimeDuration, Event = Add_end, Time = 0.000724 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352762.595709 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352762.596364 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000655 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352762.596606 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352762.599541 +INFO: TimeDuration, Event = Pool_end, Time = 0.002935 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.617325 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.617793 +INFO: TimeDuration, Event = Add_end, Time = 0.000468 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352762.617807 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352762.618142 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000335 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.630755 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.631070 +INFO: TimeDuration, Event = Add_end, Time = 0.000315 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352762.631083 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352762.631312 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.645505 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.645811 +INFO: TimeDuration, Event = Add_end, Time = 0.000306 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352762.645824 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352762.646055 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352762.646072 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352762.650977 +INFO: TimeDuration, Event = Pool_end, Time = 0.004905 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352762.651006 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352762.651120 +INFO: TimeDuration, Event = Mul_end, Time = 0.000114 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.651134 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.651158 +INFO: TimeDuration, Event = Add_end, Time = 0.000024 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352762.651171 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352762.651225 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000054 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 97.439010, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.693601 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.694536 +INFO: TimeDuration, Event = Add_end, Time = 0.000935 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352762.694560 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352762.695432 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000873 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352762.695452 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352762.699376 +INFO: TimeDuration, Event = Pool_end, Time = 0.003923 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.725507 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.726233 +INFO: TimeDuration, Event = Add_end, Time = 0.000726 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352762.726246 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352762.726891 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000645 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352762.726903 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352762.730097 +INFO: TimeDuration, Event = Pool_end, Time = 0.003194 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.747859 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.748325 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352762.748585 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352762.748919 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000334 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.761338 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.761648 +INFO: TimeDuration, Event = Add_end, Time = 0.000310 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352762.761662 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352762.761890 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.776026 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.776331 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352762.776341 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352762.776571 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352762.776588 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352762.779322 +INFO: TimeDuration, Event = Pool_end, Time = 0.002734 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352762.779341 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352762.779439 +INFO: TimeDuration, Event = Mul_end, Time = 0.000098 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.779452 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.779495 +INFO: TimeDuration, Event = Add_end, Time = 0.000043 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352762.779510 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352762.779555 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000045 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 80.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.706491, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.821648 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.822572 +INFO: TimeDuration, Event = Add_end, Time = 0.000924 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352762.822587 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352762.823448 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000861 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352762.823461 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352762.826366 +INFO: TimeDuration, Event = Pool_end, Time = 0.002905 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.853546 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.854264 +INFO: TimeDuration, Event = Add_end, Time = 0.000718 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352762.854278 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352762.854924 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000646 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352762.854945 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352762.858158 +INFO: TimeDuration, Event = Pool_end, Time = 0.003213 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.878227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.878713 +INFO: TimeDuration, Event = Add_end, Time = 0.000485 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352762.878726 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352762.879062 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000337 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.892680 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.892991 +INFO: TimeDuration, Event = Add_end, Time = 0.000311 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352762.893005 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352762.893234 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.907410 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.907712 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352762.907725 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352762.907955 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352762.907972 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352762.910705 +INFO: TimeDuration, Event = Pool_end, Time = 0.002733 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352762.910724 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352762.910821 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.910833 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.910855 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352762.910869 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352762.910913 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 79.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 98.897620, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.954437 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.955363 +INFO: TimeDuration, Event = Add_end, Time = 0.000926 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352762.955385 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352762.956259 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000873 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352762.956280 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352762.960210 +INFO: TimeDuration, Event = Pool_end, Time = 0.003929 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352762.986405 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352762.987124 +INFO: TimeDuration, Event = Add_end, Time = 0.000719 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352762.987138 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352762.987782 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000644 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352762.987793 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352762.990995 +INFO: TimeDuration, Event = Pool_end, Time = 0.003202 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.008766 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.009232 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352763.009245 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352763.009578 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000334 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.022225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.022531 +INFO: TimeDuration, Event = Add_end, Time = 0.000306 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352763.022544 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352763.022772 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.036948 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.037250 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352763.037262 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352763.037489 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000227 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352763.037505 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352763.040241 +INFO: TimeDuration, Event = Pool_end, Time = 0.002736 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352763.040260 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352763.040573 +INFO: TimeDuration, Event = Mul_end, Time = 0.000313 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.040588 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.040613 +INFO: TimeDuration, Event = Add_end, Time = 0.000025 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352763.040627 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352763.040671 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.797705, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.082932 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.083861 +INFO: TimeDuration, Event = Add_end, Time = 0.000928 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352763.083875 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352763.084743 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000868 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352763.084759 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352763.087653 +INFO: TimeDuration, Event = Pool_end, Time = 0.002895 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.114835 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.115554 +INFO: TimeDuration, Event = Add_end, Time = 0.000720 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352763.115568 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352763.116218 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000651 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352763.116240 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352763.122586 +INFO: TimeDuration, Event = Pool_end, Time = 0.006346 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.140414 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.140884 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352763.140898 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352763.141232 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000333 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.154132 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.154441 +INFO: TimeDuration, Event = Add_end, Time = 0.000309 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352763.154454 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352763.154682 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.168994 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.169302 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352763.169316 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352763.169547 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352763.169565 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352763.172287 +INFO: TimeDuration, Event = Pool_end, Time = 0.002722 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352763.172313 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352763.172415 +INFO: TimeDuration, Event = Mul_end, Time = 0.000102 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.172429 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.172455 +INFO: TimeDuration, Event = Add_end, Time = 0.000026 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352763.172469 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352763.172524 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000055 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 76.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 99.075251, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.223305 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.224260 +INFO: TimeDuration, Event = Add_end, Time = 0.000955 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352763.224294 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352763.225433 +INFO: TimeDuration, Event = Tanh_end, Time = 0.001139 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352763.225453 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352763.229013 +INFO: TimeDuration, Event = Pool_end, Time = 0.003560 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.255877 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.256614 +INFO: TimeDuration, Event = Add_end, Time = 0.000737 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352763.256633 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352763.257283 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000650 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352763.257298 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352763.261291 +INFO: TimeDuration, Event = Pool_end, Time = 0.003993 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.279264 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.279736 +INFO: TimeDuration, Event = Add_end, Time = 0.000472 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352763.279753 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352763.280091 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000338 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.293068 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.293381 +INFO: TimeDuration, Event = Add_end, Time = 0.000313 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352763.293397 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352763.293628 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.309391 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.309704 +INFO: TimeDuration, Event = Add_end, Time = 0.000312 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352763.309721 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352763.309953 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000233 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352763.309974 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352763.312689 +INFO: TimeDuration, Event = Pool_end, Time = 0.002714 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352763.312713 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352763.312819 +INFO: TimeDuration, Event = Mul_end, Time = 0.000107 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.312835 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.312859 +INFO: TimeDuration, Event = Add_end, Time = 0.000024 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352763.312876 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352763.312926 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000050 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 102.903166, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.359663 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.360588 +INFO: TimeDuration, Event = Add_end, Time = 0.000925 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352763.360604 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352763.361474 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000870 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352763.361529 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352763.364384 +INFO: TimeDuration, Event = Pool_end, Time = 0.002856 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.391633 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.392482 +INFO: TimeDuration, Event = Add_end, Time = 0.000849 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352763.392494 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352763.393146 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000651 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352763.393159 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352763.396225 +INFO: TimeDuration, Event = Pool_end, Time = 0.003067 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.416102 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.416570 +INFO: TimeDuration, Event = Add_end, Time = 0.000468 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352763.416598 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352763.416935 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000337 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.429553 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.429863 +INFO: TimeDuration, Event = Add_end, Time = 0.000311 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352763.429877 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352763.430105 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.445713 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.446021 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352763.446035 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352763.446266 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352763.446283 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352763.449006 +INFO: TimeDuration, Event = Pool_end, Time = 0.002723 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352763.449026 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352763.449124 +INFO: TimeDuration, Event = Mul_end, Time = 0.000098 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.449137 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.449159 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352763.449172 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352763.449223 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000051 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 81.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 98.996541, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.494062 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.494992 +INFO: TimeDuration, Event = Add_end, Time = 0.000930 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352763.495007 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352763.495875 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000868 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352763.495890 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352763.498786 +INFO: TimeDuration, Event = Pool_end, Time = 0.002895 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.525951 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.526678 +INFO: TimeDuration, Event = Add_end, Time = 0.000727 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352763.526691 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352763.527340 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000649 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352763.527352 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352763.530539 +INFO: TimeDuration, Event = Pool_end, Time = 0.003187 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.548256 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.548722 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352763.548737 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352763.549068 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000332 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.561765 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.562075 +INFO: TimeDuration, Event = Add_end, Time = 0.000310 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352763.562088 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352763.562317 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.576493 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.576797 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352763.576809 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352763.577041 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352763.577057 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352763.579787 +INFO: TimeDuration, Event = Pool_end, Time = 0.002730 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352763.579805 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352763.579901 +INFO: TimeDuration, Event = Mul_end, Time = 0.000096 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.579913 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.579938 +INFO: TimeDuration, Event = Add_end, Time = 0.000025 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352763.579951 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352763.579996 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000045 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 79.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.494759, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.617065 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.617989 +INFO: TimeDuration, Event = Add_end, Time = 0.000923 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352763.618003 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352763.618871 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000868 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352763.618884 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352763.621783 +INFO: TimeDuration, Event = Pool_end, Time = 0.002899 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.649010 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.649731 +INFO: TimeDuration, Event = Add_end, Time = 0.000721 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352763.649743 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352763.650390 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000647 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352763.650400 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352763.653596 +INFO: TimeDuration, Event = Pool_end, Time = 0.003196 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.671349 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.671813 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352763.671826 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352763.672158 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000332 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.685804 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.686148 +INFO: TimeDuration, Event = Add_end, Time = 0.000344 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352763.686183 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352763.686428 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000245 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.701716 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.702028 +INFO: TimeDuration, Event = Add_end, Time = 0.000312 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352763.702044 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352763.702279 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000235 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352763.702300 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352763.705014 +INFO: TimeDuration, Event = Pool_end, Time = 0.002714 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352763.705042 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352763.705149 +INFO: TimeDuration, Event = Mul_end, Time = 0.000107 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.705166 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.705190 +INFO: TimeDuration, Event = Add_end, Time = 0.000025 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352763.705207 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352763.705256 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000049 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 97.808945, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.743720 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.744641 +INFO: TimeDuration, Event = Add_end, Time = 0.000921 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352763.744656 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352763.745517 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000860 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352763.745530 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352763.748439 +INFO: TimeDuration, Event = Pool_end, Time = 0.002909 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.775624 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.776498 +INFO: TimeDuration, Event = Add_end, Time = 0.000874 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352763.776577 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352763.777226 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000649 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352763.777238 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352763.785470 +INFO: TimeDuration, Event = Pool_end, Time = 0.008233 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.801471 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.801939 +INFO: TimeDuration, Event = Add_end, Time = 0.000468 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352763.801952 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352763.802289 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000337 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.815051 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.815362 +INFO: TimeDuration, Event = Add_end, Time = 0.000311 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352763.815374 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352763.815603 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.829828 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.830132 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352763.830145 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352763.830374 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352763.830391 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352763.833122 +INFO: TimeDuration, Event = Pool_end, Time = 0.002731 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352763.833141 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352763.833240 +INFO: TimeDuration, Event = Mul_end, Time = 0.000099 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.833254 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.833275 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352763.833288 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352763.833332 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 82.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 99.066627, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.872842 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.873765 +INFO: TimeDuration, Event = Add_end, Time = 0.000923 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352763.873780 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352763.874649 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000869 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352763.874660 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352763.877562 +INFO: TimeDuration, Event = Pool_end, Time = 0.002902 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.905805 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.906529 +INFO: TimeDuration, Event = Add_end, Time = 0.000724 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352763.906542 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352763.907191 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000648 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352763.907202 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352763.910385 +INFO: TimeDuration, Event = Pool_end, Time = 0.003183 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.928172 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.928638 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352763.928652 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352763.928986 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000334 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.941666 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.941973 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352763.941986 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352763.942216 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.956383 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.956686 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352763.956698 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352763.956927 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352763.956943 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352763.959676 +INFO: TimeDuration, Event = Pool_end, Time = 0.002733 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352763.959695 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352763.959792 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352763.959805 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352763.959827 +INFO: TimeDuration, Event = Add_end, Time = 0.000023 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352763.959840 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352763.959884 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 96.607040, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.002574 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.003498 +INFO: TimeDuration, Event = Add_end, Time = 0.000925 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352764.003513 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352764.004381 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000867 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352764.004609 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352764.008508 +INFO: TimeDuration, Event = Pool_end, Time = 0.003899 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.034476 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.035200 +INFO: TimeDuration, Event = Add_end, Time = 0.000724 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352764.035213 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352764.035865 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000653 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352764.035876 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352764.039070 +INFO: TimeDuration, Event = Pool_end, Time = 0.003194 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.061648 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.062119 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352764.062133 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352764.062469 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000336 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.075314 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.075622 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352764.075635 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352764.075867 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.090121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.090426 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352764.090440 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352764.090670 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352764.090686 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352764.093413 +INFO: TimeDuration, Event = Pool_end, Time = 0.002727 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352764.093434 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352764.093533 +INFO: TimeDuration, Event = Mul_end, Time = 0.000099 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.093545 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.093567 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352764.093580 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352764.093623 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 80.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 100.381268, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.141789 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.142712 +INFO: TimeDuration, Event = Add_end, Time = 0.000922 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352764.142727 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352764.143593 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000865 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352764.143606 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352764.146497 +INFO: TimeDuration, Event = Pool_end, Time = 0.002890 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.174412 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.175138 +INFO: TimeDuration, Event = Add_end, Time = 0.000726 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352764.175152 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352764.175801 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000649 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352764.175812 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352764.178994 +INFO: TimeDuration, Event = Pool_end, Time = 0.003182 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.200651 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.201122 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352764.201136 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352764.201476 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000340 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.214299 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.214627 +INFO: TimeDuration, Event = Add_end, Time = 0.000328 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352764.214639 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352764.214870 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.231757 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.232069 +INFO: TimeDuration, Event = Add_end, Time = 0.000311 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352764.232085 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352764.232329 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000244 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352764.232355 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352764.235053 +INFO: TimeDuration, Event = Pool_end, Time = 0.002698 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352764.235076 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352764.235178 +INFO: TimeDuration, Event = Mul_end, Time = 0.000103 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.235193 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.235217 +INFO: TimeDuration, Event = Add_end, Time = 0.000023 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352764.235232 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352764.235278 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000046 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 79.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 103.250393, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.280891 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.281814 +INFO: TimeDuration, Event = Add_end, Time = 0.000923 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352764.281829 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352764.282708 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000878 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352764.282722 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352764.285600 +INFO: TimeDuration, Event = Pool_end, Time = 0.002878 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.313607 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.314329 +INFO: TimeDuration, Event = Add_end, Time = 0.000722 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352764.314343 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352764.314989 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000646 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352764.315001 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352764.318179 +INFO: TimeDuration, Event = Pool_end, Time = 0.003178 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.340876 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.341348 +INFO: TimeDuration, Event = Add_end, Time = 0.000472 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352764.341361 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352764.341700 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000339 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.354364 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.354675 +INFO: TimeDuration, Event = Add_end, Time = 0.000311 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352764.354688 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352764.354919 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.370463 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.370771 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352764.370785 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352764.371018 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000233 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352764.371033 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352764.373757 +INFO: TimeDuration, Event = Pool_end, Time = 0.002724 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352764.373777 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352764.373875 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.373887 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.373909 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352764.373923 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352764.373973 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000050 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 105.513650, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.418438 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.419366 +INFO: TimeDuration, Event = Add_end, Time = 0.000927 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352764.419381 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352764.420244 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000863 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352764.420259 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352764.423154 +INFO: TimeDuration, Event = Pool_end, Time = 0.002895 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.450311 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.451033 +INFO: TimeDuration, Event = Add_end, Time = 0.000722 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352764.451046 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352764.451695 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000649 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352764.451707 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352764.454908 +INFO: TimeDuration, Event = Pool_end, Time = 0.003201 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.472676 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.473140 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352764.473153 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352764.473485 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000332 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.486151 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.486461 +INFO: TimeDuration, Event = Add_end, Time = 0.000310 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352764.486473 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352764.486703 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.501114 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.501421 +INFO: TimeDuration, Event = Add_end, Time = 0.000307 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352764.501433 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352764.501667 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000233 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352764.501683 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352764.504407 +INFO: TimeDuration, Event = Pool_end, Time = 0.002725 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352764.504423 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352764.504521 +INFO: TimeDuration, Event = Mul_end, Time = 0.000098 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.504534 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.504556 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352764.504569 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352764.504612 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 76.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.657802, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.547469 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.548394 +INFO: TimeDuration, Event = Add_end, Time = 0.000925 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352764.548604 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352764.549467 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000863 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352764.549483 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352764.553246 +INFO: TimeDuration, Event = Pool_end, Time = 0.003763 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.579344 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.580065 +INFO: TimeDuration, Event = Add_end, Time = 0.000721 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352764.580079 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352764.580729 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000651 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352764.580742 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352764.583938 +INFO: TimeDuration, Event = Pool_end, Time = 0.003196 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.601696 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.602162 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352764.602175 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352764.602511 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000336 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.615118 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.615425 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352764.615437 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352764.615664 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.631262 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.631570 +INFO: TimeDuration, Event = Add_end, Time = 0.000307 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352764.631583 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352764.631814 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352764.631831 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352764.634553 +INFO: TimeDuration, Event = Pool_end, Time = 0.002722 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352764.634574 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352764.634673 +INFO: TimeDuration, Event = Mul_end, Time = 0.000098 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.634686 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.634707 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352764.634720 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352764.634764 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 96.638468, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.681282 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.682210 +INFO: TimeDuration, Event = Add_end, Time = 0.000928 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352764.682224 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352764.683086 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000863 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352764.683101 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352764.686001 +INFO: TimeDuration, Event = Pool_end, Time = 0.002900 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.714161 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.714883 +INFO: TimeDuration, Event = Add_end, Time = 0.000722 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352764.714897 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352764.715544 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000646 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352764.715556 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352764.718743 +INFO: TimeDuration, Event = Pool_end, Time = 0.003186 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.741343 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.741813 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352764.741827 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352764.742168 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000341 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.754797 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.755108 +INFO: TimeDuration, Event = Add_end, Time = 0.000311 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352764.755120 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352764.755351 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.770921 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.771230 +INFO: TimeDuration, Event = Add_end, Time = 0.000309 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352764.771244 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352764.771478 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000234 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352764.771495 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352764.774213 +INFO: TimeDuration, Event = Pool_end, Time = 0.002718 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352764.774234 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352764.774333 +INFO: TimeDuration, Event = Mul_end, Time = 0.000099 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.774346 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.774369 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352764.774382 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352764.774427 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000045 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 81.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 103.794670, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.814559 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.815485 +INFO: TimeDuration, Event = Add_end, Time = 0.000926 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352764.815499 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352764.816366 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000867 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352764.816379 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352764.819292 +INFO: TimeDuration, Event = Pool_end, Time = 0.002913 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.846420 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.847141 +INFO: TimeDuration, Event = Add_end, Time = 0.000721 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352764.847154 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352764.847802 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000647 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352764.847813 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352764.851014 +INFO: TimeDuration, Event = Pool_end, Time = 0.003201 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.868788 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.869254 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352764.869267 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352764.869602 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000335 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.882255 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.882565 +INFO: TimeDuration, Event = Add_end, Time = 0.000309 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352764.882577 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352764.882806 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.896978 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.897281 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352764.897294 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352764.897523 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352764.897539 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352764.900271 +INFO: TimeDuration, Event = Pool_end, Time = 0.002732 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352764.900289 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352764.900409 +INFO: TimeDuration, Event = Mul_end, Time = 0.000119 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.900419 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.900440 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352764.900450 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352764.900492 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 79.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.550525, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.941410 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.942348 +INFO: TimeDuration, Event = Add_end, Time = 0.000939 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352764.942370 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352764.943240 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000870 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352764.943259 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352764.946131 +INFO: TimeDuration, Event = Pool_end, Time = 0.002872 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352764.973617 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352764.974334 +INFO: TimeDuration, Event = Add_end, Time = 0.000717 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352764.974358 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352764.975007 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000649 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352764.975025 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352764.978199 +INFO: TimeDuration, Event = Pool_end, Time = 0.003174 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.002045 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.002517 +INFO: TimeDuration, Event = Add_end, Time = 0.000472 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352765.002535 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352765.002874 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000339 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.015818 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.016129 +INFO: TimeDuration, Event = Add_end, Time = 0.000312 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352765.016149 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352765.016382 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000233 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.031890 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.032193 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352765.032207 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352765.032436 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352765.032453 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352765.035185 +INFO: TimeDuration, Event = Pool_end, Time = 0.002732 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352765.035204 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352765.035317 +INFO: TimeDuration, Event = Mul_end, Time = 0.000114 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.035332 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.035355 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352765.035369 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352765.035419 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000049 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 104.189396, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.077808 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.078728 +INFO: TimeDuration, Event = Add_end, Time = 0.000920 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352765.078746 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352765.079609 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000863 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352765.079623 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352765.082536 +INFO: TimeDuration, Event = Pool_end, Time = 0.002913 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.109579 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.110289 +INFO: TimeDuration, Event = Add_end, Time = 0.000711 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352765.110305 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352765.110952 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000647 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352765.110964 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352765.114155 +INFO: TimeDuration, Event = Pool_end, Time = 0.003190 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.131860 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.132322 +INFO: TimeDuration, Event = Add_end, Time = 0.000462 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352765.132440 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352765.132774 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000334 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.145243 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.145548 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352765.145561 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352765.145790 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.159925 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.160225 +INFO: TimeDuration, Event = Add_end, Time = 0.000300 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352765.160239 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352765.160467 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352765.160485 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352765.163222 +INFO: TimeDuration, Event = Pool_end, Time = 0.002737 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352765.163241 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352765.163332 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.163345 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.163366 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352765.163380 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352765.163425 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000046 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 82.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 94.991052, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.206987 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.207901 +INFO: TimeDuration, Event = Add_end, Time = 0.000914 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352765.207918 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352765.208776 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000858 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352765.208793 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352765.211711 +INFO: TimeDuration, Event = Pool_end, Time = 0.002918 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.238780 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.239488 +INFO: TimeDuration, Event = Add_end, Time = 0.000708 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352765.239503 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352765.240149 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000646 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352765.240161 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352765.245465 +INFO: TimeDuration, Event = Pool_end, Time = 0.005304 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.263153 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.263616 +INFO: TimeDuration, Event = Add_end, Time = 0.000462 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352765.263631 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352765.263962 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000331 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.276576 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.276880 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352765.276895 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352765.277125 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.292295 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.292607 +INFO: TimeDuration, Event = Add_end, Time = 0.000311 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352765.292622 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352765.292851 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352765.292869 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352765.295589 +INFO: TimeDuration, Event = Pool_end, Time = 0.002720 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352765.295609 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352765.295706 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.295719 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.295741 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352765.295755 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352765.295801 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000046 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 98.236989, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.344780 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.345721 +INFO: TimeDuration, Event = Add_end, Time = 0.000940 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352765.345750 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352765.346626 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000875 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352765.346652 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352765.350517 +INFO: TimeDuration, Event = Pool_end, Time = 0.003865 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.376643 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.377364 +INFO: TimeDuration, Event = Add_end, Time = 0.000721 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352765.377378 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352765.378026 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000648 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352765.378037 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352765.381235 +INFO: TimeDuration, Event = Pool_end, Time = 0.003198 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.402642 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.403112 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352765.403125 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352765.403460 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000335 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.416328 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.416636 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352765.416649 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352765.416880 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.431136 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.431442 +INFO: TimeDuration, Event = Add_end, Time = 0.000306 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352765.431456 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352765.431686 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352765.431705 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352765.434428 +INFO: TimeDuration, Event = Pool_end, Time = 0.002724 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352765.434448 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352765.434545 +INFO: TimeDuration, Event = Mul_end, Time = 0.000096 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.434557 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.434578 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352765.434590 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352765.434639 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000049 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 80.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 101.535042, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.481825 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.482751 +INFO: TimeDuration, Event = Add_end, Time = 0.000926 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352765.482766 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352765.483624 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000857 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352765.483638 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352765.486544 +INFO: TimeDuration, Event = Pool_end, Time = 0.002907 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.514478 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.515202 +INFO: TimeDuration, Event = Add_end, Time = 0.000724 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352765.515216 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352765.515866 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000651 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352765.515878 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352765.519074 +INFO: TimeDuration, Event = Pool_end, Time = 0.003196 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.537973 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.538440 +INFO: TimeDuration, Event = Add_end, Time = 0.000467 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352765.538454 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352765.538792 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000338 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.551739 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.552052 +INFO: TimeDuration, Event = Add_end, Time = 0.000312 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352765.552064 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352765.552295 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.570798 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.571105 +INFO: TimeDuration, Event = Add_end, Time = 0.000307 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352765.571119 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352765.571350 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352765.571368 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352765.574094 +INFO: TimeDuration, Event = Pool_end, Time = 0.002726 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352765.574132 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352765.574233 +INFO: TimeDuration, Event = Mul_end, Time = 0.000101 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.574246 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.574270 +INFO: TimeDuration, Event = Add_end, Time = 0.000024 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352765.574283 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352765.574335 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000052 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 79.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 102.825489, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.614216 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.615142 +INFO: TimeDuration, Event = Add_end, Time = 0.000926 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352765.615182 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352765.616045 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000864 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352765.616060 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352765.618926 +INFO: TimeDuration, Event = Pool_end, Time = 0.002866 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.646139 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.646858 +INFO: TimeDuration, Event = Add_end, Time = 0.000719 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352765.646871 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352765.647518 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000648 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352765.647530 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352765.650736 +INFO: TimeDuration, Event = Pool_end, Time = 0.003206 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.668493 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.668958 +INFO: TimeDuration, Event = Add_end, Time = 0.000465 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352765.668972 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352765.669306 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000333 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.681964 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.682274 +INFO: TimeDuration, Event = Add_end, Time = 0.000310 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352765.682286 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352765.682516 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.696711 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.697018 +INFO: TimeDuration, Event = Add_end, Time = 0.000307 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352765.697032 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352765.697262 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352765.697278 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352765.700001 +INFO: TimeDuration, Event = Pool_end, Time = 0.002723 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352765.700022 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352765.700117 +INFO: TimeDuration, Event = Mul_end, Time = 0.000095 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.700130 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.700153 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352765.700165 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352765.700207 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000042 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.877844, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.737448 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.738376 +INFO: TimeDuration, Event = Add_end, Time = 0.000928 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352765.738392 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352765.739257 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000865 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352765.739270 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352765.742164 +INFO: TimeDuration, Event = Pool_end, Time = 0.002895 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.769311 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.770033 +INFO: TimeDuration, Event = Add_end, Time = 0.000722 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352765.770046 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352765.770696 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000650 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352765.770707 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352765.773901 +INFO: TimeDuration, Event = Pool_end, Time = 0.003193 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.791656 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.792120 +INFO: TimeDuration, Event = Add_end, Time = 0.000463 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352765.792133 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352765.792467 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000335 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.805106 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.805414 +INFO: TimeDuration, Event = Add_end, Time = 0.000309 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352765.805426 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352765.805654 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.819814 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.820119 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352765.820131 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352765.820368 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000237 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352765.820389 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352765.823109 +INFO: TimeDuration, Event = Pool_end, Time = 0.002720 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352765.823129 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352765.823226 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.823239 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.823260 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352765.823273 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352765.823317 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 76.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.658047, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.861047 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.861970 +INFO: TimeDuration, Event = Add_end, Time = 0.000924 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352765.861985 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352765.862844 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000859 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352765.862857 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352765.865768 +INFO: TimeDuration, Event = Pool_end, Time = 0.002911 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.892979 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.893700 +INFO: TimeDuration, Event = Add_end, Time = 0.000721 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352765.893713 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352765.894362 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000650 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352765.894373 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352765.897572 +INFO: TimeDuration, Event = Pool_end, Time = 0.003199 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.915338 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.915802 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352765.915816 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352765.916152 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000337 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.928879 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.929193 +INFO: TimeDuration, Event = Add_end, Time = 0.000315 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352765.929207 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352765.929438 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.943563 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.943869 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352765.943882 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352765.944111 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352765.944127 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352765.946857 +INFO: TimeDuration, Event = Pool_end, Time = 0.002730 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352765.946877 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352765.946976 +INFO: TimeDuration, Event = Mul_end, Time = 0.000098 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.946988 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.947010 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352765.947023 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352765.947068 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.590670, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352765.989897 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352765.990819 +INFO: TimeDuration, Event = Add_end, Time = 0.000922 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352765.990834 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352765.991699 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000865 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352765.991712 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352765.994627 +INFO: TimeDuration, Event = Pool_end, Time = 0.002914 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.021786 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.022514 +INFO: TimeDuration, Event = Add_end, Time = 0.000728 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352766.022529 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352766.023175 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000646 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352766.023188 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352766.027424 +INFO: TimeDuration, Event = Pool_end, Time = 0.004236 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.045180 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.045648 +INFO: TimeDuration, Event = Add_end, Time = 0.000468 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352766.045661 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352766.045994 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000333 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.058645 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.058954 +INFO: TimeDuration, Event = Add_end, Time = 0.000309 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352766.058965 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352766.059193 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.073363 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.073668 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352766.073681 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352766.073910 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352766.073928 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352766.076658 +INFO: TimeDuration, Event = Pool_end, Time = 0.002730 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352766.076678 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352766.076776 +INFO: TimeDuration, Event = Mul_end, Time = 0.000098 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.076788 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.076811 +INFO: TimeDuration, Event = Add_end, Time = 0.000023 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352766.076824 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352766.076875 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000051 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 81.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 96.573635, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.127312 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.128237 +INFO: TimeDuration, Event = Add_end, Time = 0.000926 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352766.128253 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352766.129115 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000862 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352766.129129 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352766.132036 +INFO: TimeDuration, Event = Pool_end, Time = 0.002907 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.159212 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.159938 +INFO: TimeDuration, Event = Add_end, Time = 0.000726 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352766.159951 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352766.160598 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000646 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352766.160637 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352766.163799 +INFO: TimeDuration, Event = Pool_end, Time = 0.003162 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.185727 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.186197 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352766.186211 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352766.186546 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000335 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.199256 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.199567 +INFO: TimeDuration, Event = Add_end, Time = 0.000311 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352766.199579 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352766.199809 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.215555 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.215862 +INFO: TimeDuration, Event = Add_end, Time = 0.000306 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352766.215875 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352766.216106 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352766.216123 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352766.218847 +INFO: TimeDuration, Event = Pool_end, Time = 0.002724 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352766.218867 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352766.218965 +INFO: TimeDuration, Event = Mul_end, Time = 0.000098 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.218978 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.219000 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352766.219012 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352766.219065 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000052 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 79.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 102.577298, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.268874 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.269799 +INFO: TimeDuration, Event = Add_end, Time = 0.000925 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352766.269815 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352766.270682 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000867 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352766.270695 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352766.273591 +INFO: TimeDuration, Event = Pool_end, Time = 0.002896 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.300874 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.301597 +INFO: TimeDuration, Event = Add_end, Time = 0.000723 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352766.301611 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352766.302259 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000648 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352766.302272 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352766.305466 +INFO: TimeDuration, Event = Pool_end, Time = 0.003195 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.326647 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.327133 +INFO: TimeDuration, Event = Add_end, Time = 0.000486 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352766.327147 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352766.327485 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000338 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.340239 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.340549 +INFO: TimeDuration, Event = Add_end, Time = 0.000310 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352766.340560 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352766.340791 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.356076 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.356383 +INFO: TimeDuration, Event = Add_end, Time = 0.000307 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352766.356565 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352766.356792 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000227 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352766.356811 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352766.359369 +INFO: TimeDuration, Event = Pool_end, Time = 0.002557 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352766.359387 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352766.359485 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.359498 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.359522 +INFO: TimeDuration, Event = Add_end, Time = 0.000024 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352766.359535 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352766.359584 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000049 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 100.149276, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.402180 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.403103 +INFO: TimeDuration, Event = Add_end, Time = 0.000923 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352766.403120 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352766.403982 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000863 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352766.403995 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352766.409030 +INFO: TimeDuration, Event = Pool_end, Time = 0.005035 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.434207 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.434929 +INFO: TimeDuration, Event = Add_end, Time = 0.000722 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352766.434942 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352766.435588 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000647 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352766.435600 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352766.438782 +INFO: TimeDuration, Event = Pool_end, Time = 0.003182 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.456562 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.457027 +INFO: TimeDuration, Event = Add_end, Time = 0.000465 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352766.457040 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352766.457373 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000333 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.470045 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.470354 +INFO: TimeDuration, Event = Add_end, Time = 0.000309 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352766.470366 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352766.470596 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.484875 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.485181 +INFO: TimeDuration, Event = Add_end, Time = 0.000306 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352766.485194 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352766.485424 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352766.485441 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352766.488168 +INFO: TimeDuration, Event = Pool_end, Time = 0.002727 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352766.488186 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352766.488285 +INFO: TimeDuration, Event = Mul_end, Time = 0.000098 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.488299 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.488326 +INFO: TimeDuration, Event = Add_end, Time = 0.000027 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352766.488573 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352766.488618 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000045 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 82.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 96.150039, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.526119 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.527046 +INFO: TimeDuration, Event = Add_end, Time = 0.000927 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352766.527061 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352766.527926 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000865 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352766.527939 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352766.530840 +INFO: TimeDuration, Event = Pool_end, Time = 0.002901 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.558024 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.558744 +INFO: TimeDuration, Event = Add_end, Time = 0.000721 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352766.558758 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352766.559403 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000645 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352766.559415 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352766.562614 +INFO: TimeDuration, Event = Pool_end, Time = 0.003199 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.580396 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.580863 +INFO: TimeDuration, Event = Add_end, Time = 0.000467 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352766.580877 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352766.581213 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000336 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.593980 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.594291 +INFO: TimeDuration, Event = Add_end, Time = 0.000311 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352766.594305 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352766.594536 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.608694 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.608997 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352766.609010 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352766.609240 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352766.609256 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352766.611990 +INFO: TimeDuration, Event = Pool_end, Time = 0.002734 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352766.612010 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352766.612107 +INFO: TimeDuration, Event = Mul_end, Time = 0.000098 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.612119 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.612143 +INFO: TimeDuration, Event = Add_end, Time = 0.000024 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352766.612157 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352766.612200 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.937245, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.654583 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.655507 +INFO: TimeDuration, Event = Add_end, Time = 0.000924 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352766.655523 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352766.656394 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000871 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352766.656619 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352766.660551 +INFO: TimeDuration, Event = Pool_end, Time = 0.003933 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.686487 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.687209 +INFO: TimeDuration, Event = Add_end, Time = 0.000721 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352766.687222 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352766.687872 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000650 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352766.687882 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352766.691078 +INFO: TimeDuration, Event = Pool_end, Time = 0.003196 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.708854 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.709320 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352766.709333 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352766.709665 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000332 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.722330 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.722661 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352766.722674 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352766.722901 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.737066 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.737370 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352766.737382 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352766.737614 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352766.737632 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352766.740360 +INFO: TimeDuration, Event = Pool_end, Time = 0.002728 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352766.740582 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352766.740684 +INFO: TimeDuration, Event = Mul_end, Time = 0.000102 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.740697 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.740719 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352766.740734 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352766.740778 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 80.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 98.778130, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.778004 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.778930 +INFO: TimeDuration, Event = Add_end, Time = 0.000925 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352766.778944 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352766.779805 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000860 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352766.779817 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352766.782726 +INFO: TimeDuration, Event = Pool_end, Time = 0.002909 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.809920 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.810643 +INFO: TimeDuration, Event = Add_end, Time = 0.000724 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352766.810656 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352766.811304 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000648 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352766.811316 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352766.814507 +INFO: TimeDuration, Event = Pool_end, Time = 0.003191 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.832279 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.832746 +INFO: TimeDuration, Event = Add_end, Time = 0.000467 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352766.832760 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352766.833092 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000333 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.845757 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.846065 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352766.846078 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352766.846307 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.860487 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.860791 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352766.860806 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352766.861036 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352766.861052 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352766.863790 +INFO: TimeDuration, Event = Pool_end, Time = 0.002738 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352766.863810 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352766.863908 +INFO: TimeDuration, Event = Mul_end, Time = 0.000098 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.863921 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.863944 +INFO: TimeDuration, Event = Add_end, Time = 0.000023 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352766.863959 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352766.864011 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000052 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 79.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.885247, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.901161 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.902086 +INFO: TimeDuration, Event = Add_end, Time = 0.000925 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352766.902100 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352766.902962 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000863 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352766.902975 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352766.905877 +INFO: TimeDuration, Event = Pool_end, Time = 0.002902 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.933083 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.933803 +INFO: TimeDuration, Event = Add_end, Time = 0.000719 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352766.933817 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352766.934464 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000646 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352766.934475 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352766.937675 +INFO: TimeDuration, Event = Pool_end, Time = 0.003200 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.955447 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.955913 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352766.955927 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352766.956261 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000334 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.968950 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.969259 +INFO: TimeDuration, Event = Add_end, Time = 0.000310 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352766.969271 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352766.969500 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.983642 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.983946 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352766.983958 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352766.984188 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352766.984204 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352766.986954 +INFO: TimeDuration, Event = Pool_end, Time = 0.002750 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352766.986974 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352766.987072 +INFO: TimeDuration, Event = Mul_end, Time = 0.000098 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352766.987084 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352766.987106 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352766.987118 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352766.987161 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.788767, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.024164 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.025084 +INFO: TimeDuration, Event = Add_end, Time = 0.000921 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352767.025101 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352767.025959 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000858 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352767.025974 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352767.028907 +INFO: TimeDuration, Event = Pool_end, Time = 0.002933 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.056052 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.056775 +INFO: TimeDuration, Event = Add_end, Time = 0.000723 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352767.056791 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352767.057435 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000644 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352767.057447 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352767.060643 +INFO: TimeDuration, Event = Pool_end, Time = 0.003196 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.078396 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.078862 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352767.078875 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352767.079210 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000335 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.091865 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.092172 +INFO: TimeDuration, Event = Add_end, Time = 0.000307 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352767.092184 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352767.092413 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.106608 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.106911 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352767.106923 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352767.107151 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352767.107167 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352767.109903 +INFO: TimeDuration, Event = Pool_end, Time = 0.002736 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352767.109923 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352767.110020 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.110032 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.110053 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352767.110067 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352767.110109 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 76.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.448193, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.148316 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.149239 +INFO: TimeDuration, Event = Add_end, Time = 0.000923 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352767.149255 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352767.150118 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000863 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352767.150130 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352767.153029 +INFO: TimeDuration, Event = Pool_end, Time = 0.002898 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.180239 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.180961 +INFO: TimeDuration, Event = Add_end, Time = 0.000722 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352767.181097 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352767.181738 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000641 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352767.181749 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352767.184830 +INFO: TimeDuration, Event = Pool_end, Time = 0.003081 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.202612 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.203077 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352767.203090 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352767.203425 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000335 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.216095 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.216405 +INFO: TimeDuration, Event = Add_end, Time = 0.000310 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352767.216572 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352767.216801 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.230841 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.231147 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352767.231159 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352767.231391 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352767.231436 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352767.234133 +INFO: TimeDuration, Event = Pool_end, Time = 0.002697 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352767.234156 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352767.234255 +INFO: TimeDuration, Event = Mul_end, Time = 0.000098 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.234267 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.234289 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352767.234303 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352767.234347 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.530501, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.271996 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.272925 +INFO: TimeDuration, Event = Add_end, Time = 0.000929 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352767.272944 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352767.273808 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000865 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352767.273823 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352767.276716 +INFO: TimeDuration, Event = Pool_end, Time = 0.002893 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.303833 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.304554 +INFO: TimeDuration, Event = Add_end, Time = 0.000720 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352767.304565 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352767.305220 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000655 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352767.305232 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352767.308445 +INFO: TimeDuration, Event = Pool_end, Time = 0.003213 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.330767 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.331236 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352767.331250 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352767.331585 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000335 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.344384 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.344692 +INFO: TimeDuration, Event = Add_end, Time = 0.000309 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352767.344705 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352767.344935 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.360666 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.360977 +INFO: TimeDuration, Event = Add_end, Time = 0.000311 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352767.360991 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352767.361221 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352767.361240 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352767.363960 +INFO: TimeDuration, Event = Pool_end, Time = 0.002720 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352767.363980 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352767.364078 +INFO: TimeDuration, Event = Mul_end, Time = 0.000099 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.364091 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.364112 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352767.364125 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352767.364169 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 81.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 101.741349, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.411455 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.412441 +INFO: TimeDuration, Event = Add_end, Time = 0.000986 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352767.412601 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352767.413465 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000864 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352767.413481 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352767.418282 +INFO: TimeDuration, Event = Pool_end, Time = 0.004802 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.444376 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.445096 +INFO: TimeDuration, Event = Add_end, Time = 0.000719 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352767.445110 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352767.445757 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000648 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352767.445770 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352767.448947 +INFO: TimeDuration, Event = Pool_end, Time = 0.003178 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.466719 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.467183 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352767.467196 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352767.467531 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000335 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.480204 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.480512 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352767.480564 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352767.480793 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.494927 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.495232 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352767.495245 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352767.495477 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352767.495494 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352767.498220 +INFO: TimeDuration, Event = Pool_end, Time = 0.002726 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352767.498240 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352767.498340 +INFO: TimeDuration, Event = Mul_end, Time = 0.000100 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.498353 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.498375 +INFO: TimeDuration, Event = Add_end, Time = 0.000023 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352767.498388 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352767.498439 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000051 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 79.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 101.353540, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.535699 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.536616 +INFO: TimeDuration, Event = Add_end, Time = 0.000917 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352767.536633 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352767.537499 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000866 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352767.537513 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352767.540390 +INFO: TimeDuration, Event = Pool_end, Time = 0.002877 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.567614 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.568334 +INFO: TimeDuration, Event = Add_end, Time = 0.000721 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352767.568586 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352767.569231 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000645 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352767.569243 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352767.572201 +INFO: TimeDuration, Event = Pool_end, Time = 0.002958 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.593885 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.594354 +INFO: TimeDuration, Event = Add_end, Time = 0.000468 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352767.594368 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352767.594706 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000338 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.607598 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.607905 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352767.607918 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352767.608146 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.623477 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.623785 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352767.623799 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352767.624030 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352767.624047 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352767.626771 +INFO: TimeDuration, Event = Pool_end, Time = 0.002724 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352767.626791 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352767.626889 +INFO: TimeDuration, Event = Mul_end, Time = 0.000099 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.626901 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.626923 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352767.626935 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352767.626984 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000049 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 100.841580, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.669597 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.670508 +INFO: TimeDuration, Event = Add_end, Time = 0.000911 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352767.670524 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352767.671385 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000861 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352767.671400 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352767.676480 +INFO: TimeDuration, Event = Pool_end, Time = 0.005080 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.702129 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.702847 +INFO: TimeDuration, Event = Add_end, Time = 0.000718 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352767.702861 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352767.703506 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000646 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352767.703517 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352767.706702 +INFO: TimeDuration, Event = Pool_end, Time = 0.003184 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.724575 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.725039 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352767.725051 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352767.725383 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000332 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.743256 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.743573 +INFO: TimeDuration, Event = Add_end, Time = 0.000317 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352767.743587 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352767.743814 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.758050 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.758357 +INFO: TimeDuration, Event = Add_end, Time = 0.000307 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352767.758371 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352767.758602 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352767.758621 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352767.761341 +INFO: TimeDuration, Event = Pool_end, Time = 0.002721 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352767.761362 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352767.761461 +INFO: TimeDuration, Event = Mul_end, Time = 0.000099 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.761473 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.761495 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352767.761507 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352767.761551 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 82.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 101.913287, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.799110 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.800034 +INFO: TimeDuration, Event = Add_end, Time = 0.000924 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352767.800049 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352767.800920 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000870 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352767.800938 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352767.803831 +INFO: TimeDuration, Event = Pool_end, Time = 0.002894 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.834253 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.834979 +INFO: TimeDuration, Event = Add_end, Time = 0.000725 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352767.834992 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352767.835642 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000650 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352767.835654 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352767.838834 +INFO: TimeDuration, Event = Pool_end, Time = 0.003179 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.856591 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.857058 +INFO: TimeDuration, Event = Add_end, Time = 0.000467 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352767.857071 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352767.857406 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000335 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.870213 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.870523 +INFO: TimeDuration, Event = Add_end, Time = 0.000309 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352767.870536 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352767.870767 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.886331 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.886638 +INFO: TimeDuration, Event = Add_end, Time = 0.000307 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352767.886652 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352767.886884 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352767.886901 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352767.889625 +INFO: TimeDuration, Event = Pool_end, Time = 0.002724 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352767.889645 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352767.889745 +INFO: TimeDuration, Event = Mul_end, Time = 0.000100 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.889775 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.889796 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352767.889811 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352767.889870 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000059 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 100.244905, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.929952 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.930874 +INFO: TimeDuration, Event = Add_end, Time = 0.000922 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352767.930889 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352767.931752 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000863 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352767.931767 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352767.934670 +INFO: TimeDuration, Event = Pool_end, Time = 0.002904 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.961862 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.962582 +INFO: TimeDuration, Event = Add_end, Time = 0.000720 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352767.962594 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352767.963243 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000649 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352767.963254 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352767.966451 +INFO: TimeDuration, Event = Pool_end, Time = 0.003197 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.984216 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.987707 +INFO: TimeDuration, Event = Add_end, Time = 0.003491 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352767.987731 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352767.988084 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000353 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352767.997952 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352767.998262 +INFO: TimeDuration, Event = Add_end, Time = 0.000311 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352767.998276 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352767.998504 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.012683 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.012985 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352768.012997 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352768.013229 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352768.013244 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352768.015981 +INFO: TimeDuration, Event = Pool_end, Time = 0.002737 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352768.016000 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352768.016105 +INFO: TimeDuration, Event = Mul_end, Time = 0.000105 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.016118 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.016140 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352768.016153 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352768.016204 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000051 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 80.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 96.082270, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.053755 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.054680 +INFO: TimeDuration, Event = Add_end, Time = 0.000925 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352768.054696 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352768.055563 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000867 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352768.055576 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352768.058475 +INFO: TimeDuration, Event = Pool_end, Time = 0.002899 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.085651 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.086372 +INFO: TimeDuration, Event = Add_end, Time = 0.000721 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352768.086384 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352768.087031 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000647 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352768.087042 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352768.090241 +INFO: TimeDuration, Event = Pool_end, Time = 0.003200 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.113898 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.114370 +INFO: TimeDuration, Event = Add_end, Time = 0.000472 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352768.114384 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352768.114719 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000335 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.127342 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.127651 +INFO: TimeDuration, Event = Add_end, Time = 0.000309 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352768.127663 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352768.127899 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000236 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.143432 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.143738 +INFO: TimeDuration, Event = Add_end, Time = 0.000306 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352768.143751 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352768.143981 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352768.143998 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352768.146727 +INFO: TimeDuration, Event = Pool_end, Time = 0.002729 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352768.146747 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352768.146845 +INFO: TimeDuration, Event = Mul_end, Time = 0.000098 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.146858 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.146879 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352768.146893 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352768.146935 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 79.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 103.064785, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.189964 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.190891 +INFO: TimeDuration, Event = Add_end, Time = 0.000927 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352768.190906 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352768.191772 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000866 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352768.191786 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352768.196797 +INFO: TimeDuration, Event = Pool_end, Time = 0.005011 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.221904 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.222627 +INFO: TimeDuration, Event = Add_end, Time = 0.000723 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352768.222640 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352768.223289 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000649 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352768.223300 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352768.226492 +INFO: TimeDuration, Event = Pool_end, Time = 0.003192 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.244271 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.244736 +INFO: TimeDuration, Event = Add_end, Time = 0.000465 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352768.244751 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352768.245093 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000342 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.257741 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.258049 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352768.258061 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352768.258290 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.273960 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.274279 +INFO: TimeDuration, Event = Add_end, Time = 0.000318 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352768.274292 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352768.274526 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000234 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352768.274543 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352768.277267 +INFO: TimeDuration, Event = Pool_end, Time = 0.002723 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352768.277288 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352768.277408 +INFO: TimeDuration, Event = Mul_end, Time = 0.000120 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.277438 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.277463 +INFO: TimeDuration, Event = Add_end, Time = 0.000025 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352768.277478 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352768.277526 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000049 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 97.415656, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.322233 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.323190 +INFO: TimeDuration, Event = Add_end, Time = 0.000957 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352768.323218 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352768.324095 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000877 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352768.324120 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352768.326994 +INFO: TimeDuration, Event = Pool_end, Time = 0.002874 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.354302 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.355059 +INFO: TimeDuration, Event = Add_end, Time = 0.000757 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352768.355073 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352768.355740 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000666 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352768.355752 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352768.358899 +INFO: TimeDuration, Event = Pool_end, Time = 0.003148 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.376669 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.377134 +INFO: TimeDuration, Event = Add_end, Time = 0.000465 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352768.377148 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352768.377481 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000333 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.390125 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.390436 +INFO: TimeDuration, Event = Add_end, Time = 0.000311 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352768.390450 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352768.390680 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.404849 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.405154 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352768.405166 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352768.405397 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352768.405413 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352768.408142 +INFO: TimeDuration, Event = Pool_end, Time = 0.002729 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352768.408160 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352768.408257 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.408269 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.408289 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352768.408302 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352768.408369 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000066 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 76.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 96.074970, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.451141 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.452067 +INFO: TimeDuration, Event = Add_end, Time = 0.000926 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352768.452083 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352768.452954 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000870 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352768.452969 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352768.455855 +INFO: TimeDuration, Event = Pool_end, Time = 0.002887 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.482996 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.483720 +INFO: TimeDuration, Event = Add_end, Time = 0.000723 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352768.483732 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352768.484383 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000650 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352768.484579 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352768.487595 +INFO: TimeDuration, Event = Pool_end, Time = 0.003015 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.505366 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.505833 +INFO: TimeDuration, Event = Add_end, Time = 0.000467 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352768.505846 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352768.506180 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000335 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.518816 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.519125 +INFO: TimeDuration, Event = Add_end, Time = 0.000309 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352768.519136 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352768.519367 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.533533 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.533838 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352768.533850 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352768.534081 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352768.534098 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352768.536828 +INFO: TimeDuration, Event = Pool_end, Time = 0.002730 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352768.536847 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352768.536945 +INFO: TimeDuration, Event = Mul_end, Time = 0.000098 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.536958 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.536981 +INFO: TimeDuration, Event = Add_end, Time = 0.000023 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352768.536994 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352768.537039 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000045 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.454600, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.579786 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.580711 +INFO: TimeDuration, Event = Add_end, Time = 0.000925 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352768.580728 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352768.581593 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000865 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352768.581608 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352768.584499 +INFO: TimeDuration, Event = Pool_end, Time = 0.002891 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.611684 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.612409 +INFO: TimeDuration, Event = Add_end, Time = 0.000725 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352768.612596 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352768.613244 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000648 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352768.613256 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352768.618372 +INFO: TimeDuration, Event = Pool_end, Time = 0.005117 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.636125 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.638706 +INFO: TimeDuration, Event = Add_end, Time = 0.002580 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352768.638732 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352768.639084 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000352 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.649783 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.650096 +INFO: TimeDuration, Event = Add_end, Time = 0.000313 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352768.650108 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352768.650337 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.665565 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.665871 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352768.665885 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352768.666118 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352768.666139 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352768.668864 +INFO: TimeDuration, Event = Pool_end, Time = 0.002725 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352768.668886 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352768.669007 +INFO: TimeDuration, Event = Mul_end, Time = 0.000121 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.669020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.669045 +INFO: TimeDuration, Event = Add_end, Time = 0.000025 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352768.669059 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352768.669130 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000071 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 81.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 98.670549, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.711575 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.712506 +INFO: TimeDuration, Event = Add_end, Time = 0.000932 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352768.712520 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352768.713386 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000866 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352768.713399 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352768.716296 +INFO: TimeDuration, Event = Pool_end, Time = 0.002897 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.743474 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.744194 +INFO: TimeDuration, Event = Add_end, Time = 0.000719 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352768.744207 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352768.744853 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000646 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352768.744865 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352768.748062 +INFO: TimeDuration, Event = Pool_end, Time = 0.003197 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.765829 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.766294 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352768.766307 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352768.766639 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000333 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.779252 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.779559 +INFO: TimeDuration, Event = Add_end, Time = 0.000307 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352768.779571 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352768.779802 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.793976 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.794279 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352768.794292 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352768.794521 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352768.794536 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352768.797268 +INFO: TimeDuration, Event = Pool_end, Time = 0.002732 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352768.797288 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352768.797386 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.797399 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.797420 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352768.797433 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352768.797483 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000050 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 79.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.390191, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.839505 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.840430 +INFO: TimeDuration, Event = Add_end, Time = 0.000924 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352768.840613 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352768.841477 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000864 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352768.841492 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352768.845235 +INFO: TimeDuration, Event = Pool_end, Time = 0.003743 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.871348 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.872069 +INFO: TimeDuration, Event = Add_end, Time = 0.000721 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352768.872083 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352768.872737 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000654 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352768.872750 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352768.879116 +INFO: TimeDuration, Event = Pool_end, Time = 0.006366 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.897012 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.897479 +INFO: TimeDuration, Event = Add_end, Time = 0.000467 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352768.897492 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352768.897833 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000341 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.915737 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.916049 +INFO: TimeDuration, Event = Add_end, Time = 0.000313 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352768.916063 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352768.916293 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.931889 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.932196 +INFO: TimeDuration, Event = Add_end, Time = 0.000306 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352768.932209 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352768.932442 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000233 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352768.932577 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352768.935180 +INFO: TimeDuration, Event = Pool_end, Time = 0.002603 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352768.935199 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352768.935297 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.935309 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.935329 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352768.935342 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352768.935391 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000049 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 104.958454, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352768.978168 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352768.979095 +INFO: TimeDuration, Event = Add_end, Time = 0.000928 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352768.979109 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352768.979974 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000864 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352768.979987 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352768.982889 +INFO: TimeDuration, Event = Pool_end, Time = 0.002902 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.010272 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.010993 +INFO: TimeDuration, Event = Add_end, Time = 0.000721 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352769.011007 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352769.011657 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000649 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352769.011669 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352769.014857 +INFO: TimeDuration, Event = Pool_end, Time = 0.003188 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.033695 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.034163 +INFO: TimeDuration, Event = Add_end, Time = 0.000468 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352769.034177 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352769.034513 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000337 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.047098 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.047408 +INFO: TimeDuration, Event = Add_end, Time = 0.000310 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352769.047422 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352769.047653 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.061810 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.062114 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352769.062127 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352769.062361 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000234 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352769.062395 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352769.065106 +INFO: TimeDuration, Event = Pool_end, Time = 0.002711 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352769.065126 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352769.065224 +INFO: TimeDuration, Event = Mul_end, Time = 0.000098 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.065237 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.065259 +INFO: TimeDuration, Event = Add_end, Time = 0.000023 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352769.065273 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352769.065316 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 82.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 96.661260, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.109327 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.110253 +INFO: TimeDuration, Event = Add_end, Time = 0.000926 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352769.110267 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352769.111125 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000857 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352769.111137 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352769.114045 +INFO: TimeDuration, Event = Pool_end, Time = 0.002908 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.141227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.141950 +INFO: TimeDuration, Event = Add_end, Time = 0.000723 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352769.141979 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352769.142628 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000649 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352769.142639 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352769.145823 +INFO: TimeDuration, Event = Pool_end, Time = 0.003184 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.163612 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.164085 +INFO: TimeDuration, Event = Add_end, Time = 0.000473 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352769.164099 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352769.164441 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000343 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.177076 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.177383 +INFO: TimeDuration, Event = Add_end, Time = 0.000307 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352769.177396 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352769.177624 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.191775 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.192078 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352769.192089 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352769.192332 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000243 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352769.192583 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352769.195071 +INFO: TimeDuration, Event = Pool_end, Time = 0.002488 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352769.195091 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352769.195189 +INFO: TimeDuration, Event = Mul_end, Time = 0.000098 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.195201 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.195225 +INFO: TimeDuration, Event = Add_end, Time = 0.000024 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352769.195240 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352769.195287 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000048 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.325373, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.242850 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.243813 +INFO: TimeDuration, Event = Add_end, Time = 0.000963 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352769.243830 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352769.244692 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000862 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352769.244710 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352769.247567 +INFO: TimeDuration, Event = Pool_end, Time = 0.002857 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.274700 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.275438 +INFO: TimeDuration, Event = Add_end, Time = 0.000738 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352769.275451 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352769.276098 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000647 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352769.276109 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352769.279286 +INFO: TimeDuration, Event = Pool_end, Time = 0.003177 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.297070 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.297538 +INFO: TimeDuration, Event = Add_end, Time = 0.000468 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352769.297552 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352769.297888 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000336 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.310530 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.310838 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352769.310850 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352769.311079 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.325269 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.325573 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352769.325586 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352769.325816 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352769.325832 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352769.328567 +INFO: TimeDuration, Event = Pool_end, Time = 0.002734 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352769.328587 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352769.328685 +INFO: TimeDuration, Event = Mul_end, Time = 0.000098 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.328698 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.328719 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352769.328733 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352769.328858 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000125 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 80.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 99.496038, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.371429 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.372353 +INFO: TimeDuration, Event = Add_end, Time = 0.000924 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352769.372367 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352769.373237 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000870 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352769.373251 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352769.376143 +INFO: TimeDuration, Event = Pool_end, Time = 0.002892 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.403318 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.404039 +INFO: TimeDuration, Event = Add_end, Time = 0.000721 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352769.404052 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352769.404702 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000650 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352769.404716 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352769.407910 +INFO: TimeDuration, Event = Pool_end, Time = 0.003194 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.425672 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.426136 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352769.426149 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352769.426483 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000334 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.439087 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.439394 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352769.439406 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352769.439636 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.453803 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.454107 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352769.454119 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352769.454348 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352769.454380 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352769.457094 +INFO: TimeDuration, Event = Pool_end, Time = 0.002714 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352769.457114 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352769.457212 +INFO: TimeDuration, Event = Mul_end, Time = 0.000098 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.457224 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.457246 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352769.457259 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352769.457304 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000045 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 79.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.411006, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.494538 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.495458 +INFO: TimeDuration, Event = Add_end, Time = 0.000921 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352769.495473 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352769.496343 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000870 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352769.496355 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352769.499222 +INFO: TimeDuration, Event = Pool_end, Time = 0.002868 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.526370 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.527089 +INFO: TimeDuration, Event = Add_end, Time = 0.000719 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352769.527103 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352769.527749 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000646 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352769.527761 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352769.530964 +INFO: TimeDuration, Event = Pool_end, Time = 0.003204 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.548743 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.549223 +INFO: TimeDuration, Event = Add_end, Time = 0.000480 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352769.549237 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352769.549574 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000337 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.562200 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.562509 +INFO: TimeDuration, Event = Add_end, Time = 0.000309 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352769.562537 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352769.562768 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.576919 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.577221 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352769.577233 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352769.577462 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352769.577478 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352769.580212 +INFO: TimeDuration, Event = Pool_end, Time = 0.002733 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352769.580231 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352769.580331 +INFO: TimeDuration, Event = Mul_end, Time = 0.000100 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.580581 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.580606 +INFO: TimeDuration, Event = Add_end, Time = 0.000025 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352769.580620 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352769.580665 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000045 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.457044, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.618068 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.619032 +INFO: TimeDuration, Event = Add_end, Time = 0.000965 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352769.619048 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352769.619912 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000865 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352769.619925 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352769.622788 +INFO: TimeDuration, Event = Pool_end, Time = 0.002862 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.649959 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.650681 +INFO: TimeDuration, Event = Add_end, Time = 0.000722 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352769.650693 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352769.651345 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000651 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352769.651355 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352769.654548 +INFO: TimeDuration, Event = Pool_end, Time = 0.003194 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.674421 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.674893 +INFO: TimeDuration, Event = Add_end, Time = 0.000472 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352769.674922 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352769.675257 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000335 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.687892 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.688201 +INFO: TimeDuration, Event = Add_end, Time = 0.000309 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352769.688214 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352769.688445 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.702630 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.702935 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352769.702948 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352769.703176 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352769.703194 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352769.705926 +INFO: TimeDuration, Event = Pool_end, Time = 0.002732 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352769.705946 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352769.706045 +INFO: TimeDuration, Event = Mul_end, Time = 0.000099 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.706057 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.706079 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352769.706091 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352769.706134 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000042 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 76.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 98.011342, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.745927 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.746855 +INFO: TimeDuration, Event = Add_end, Time = 0.000928 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352769.746869 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352769.747733 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000864 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352769.747746 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352769.750647 +INFO: TimeDuration, Event = Pool_end, Time = 0.002901 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.777785 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.778506 +INFO: TimeDuration, Event = Add_end, Time = 0.000720 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352769.778518 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352769.779163 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000644 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352769.779174 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352769.782374 +INFO: TimeDuration, Event = Pool_end, Time = 0.003200 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.800133 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.800597 +INFO: TimeDuration, Event = Add_end, Time = 0.000465 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352769.800612 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352769.800947 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000334 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.813587 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.813896 +INFO: TimeDuration, Event = Add_end, Time = 0.000309 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352769.813908 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352769.814136 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.828279 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.828584 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352769.828597 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352769.828826 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352769.828844 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352769.831576 +INFO: TimeDuration, Event = Pool_end, Time = 0.002732 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352769.831595 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352769.831692 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.831705 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.831726 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352769.831739 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352769.831782 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 96.578383, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.869188 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.870113 +INFO: TimeDuration, Event = Add_end, Time = 0.000925 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352769.870128 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352769.870989 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000861 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352769.871002 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352769.873909 +INFO: TimeDuration, Event = Pool_end, Time = 0.002907 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.901074 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.901793 +INFO: TimeDuration, Event = Add_end, Time = 0.000719 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352769.901807 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352769.902456 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000649 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352769.902466 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352769.905659 +INFO: TimeDuration, Event = Pool_end, Time = 0.003192 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.923419 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.923886 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352769.923900 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352769.924233 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000332 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.936890 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.937198 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352769.937210 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352769.937441 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.951583 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.951887 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352769.951898 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352769.952133 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000235 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352769.952149 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352769.954876 +INFO: TimeDuration, Event = Pool_end, Time = 0.002727 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352769.954896 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352769.954994 +INFO: TimeDuration, Event = Mul_end, Time = 0.000098 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352769.955007 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352769.955029 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352769.955042 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352769.955084 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000042 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 81.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.446189, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.000905 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.001876 +INFO: TimeDuration, Event = Add_end, Time = 0.000971 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352770.001894 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352770.002755 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000861 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352770.002768 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352770.005607 +INFO: TimeDuration, Event = Pool_end, Time = 0.002839 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.032839 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.033557 +INFO: TimeDuration, Event = Add_end, Time = 0.000719 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352770.033570 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352770.034217 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000647 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352770.034229 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352770.037427 +INFO: TimeDuration, Event = Pool_end, Time = 0.003199 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.056246 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.056713 +INFO: TimeDuration, Event = Add_end, Time = 0.000467 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352770.056728 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352770.057065 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000337 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.069725 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.070034 +INFO: TimeDuration, Event = Add_end, Time = 0.000309 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352770.070046 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352770.070276 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.084454 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.084758 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352770.084770 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352770.085003 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000233 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352770.085018 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352770.087748 +INFO: TimeDuration, Event = Pool_end, Time = 0.002729 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352770.087767 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352770.087863 +INFO: TimeDuration, Event = Mul_end, Time = 0.000096 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.087876 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.087897 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352770.087909 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352770.087977 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000068 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 79.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 105.011521, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.127749 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.128670 +INFO: TimeDuration, Event = Add_end, Time = 0.000922 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352770.128688 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352770.129552 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000865 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352770.129565 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352770.132471 +INFO: TimeDuration, Event = Pool_end, Time = 0.002906 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.159697 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.160415 +INFO: TimeDuration, Event = Add_end, Time = 0.000718 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352770.160426 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352770.161071 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000645 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352770.161083 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352770.164291 +INFO: TimeDuration, Event = Pool_end, Time = 0.003208 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.182060 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.182526 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352770.182538 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352770.182873 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000335 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.195527 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.195834 +INFO: TimeDuration, Event = Add_end, Time = 0.000307 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352770.195847 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352770.196075 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.210259 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.210564 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352770.210577 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352770.210805 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352770.210821 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352770.213555 +INFO: TimeDuration, Event = Pool_end, Time = 0.002733 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352770.213575 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352770.213672 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.213685 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.213706 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352770.213720 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352770.213763 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 97.443060, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.251749 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.252663 +INFO: TimeDuration, Event = Add_end, Time = 0.000914 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352770.252681 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352770.253541 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000859 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352770.253555 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352770.256464 +INFO: TimeDuration, Event = Pool_end, Time = 0.002910 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.283743 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.284466 +INFO: TimeDuration, Event = Add_end, Time = 0.000722 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352770.284476 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352770.285128 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000652 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352770.285141 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352770.288351 +INFO: TimeDuration, Event = Pool_end, Time = 0.003209 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.306106 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.306572 +INFO: TimeDuration, Event = Add_end, Time = 0.000467 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352770.306585 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352770.306918 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000333 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.319545 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.319854 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352770.319865 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352770.320096 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.334275 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.334579 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352770.334591 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352770.334823 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352770.334839 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352770.337569 +INFO: TimeDuration, Event = Pool_end, Time = 0.002730 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352770.337589 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352770.337687 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.337699 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.337720 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352770.337733 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352770.337777 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 82.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.444106, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.375452 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.376404 +INFO: TimeDuration, Event = Add_end, Time = 0.000952 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352770.376418 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352770.377275 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000857 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352770.377290 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352770.380174 +INFO: TimeDuration, Event = Pool_end, Time = 0.002884 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.407452 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.408175 +INFO: TimeDuration, Event = Add_end, Time = 0.000723 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352770.408189 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352770.408840 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000651 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352770.408854 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352770.412042 +INFO: TimeDuration, Event = Pool_end, Time = 0.003188 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.429845 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.430311 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352770.430324 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352770.430658 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000334 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.443332 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.443640 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352770.443652 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352770.443884 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.458068 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.458374 +INFO: TimeDuration, Event = Add_end, Time = 0.000306 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352770.458386 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352770.458617 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352770.458633 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352770.461360 +INFO: TimeDuration, Event = Pool_end, Time = 0.002726 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352770.461379 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352770.461475 +INFO: TimeDuration, Event = Mul_end, Time = 0.000096 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.461488 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.461510 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352770.461524 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352770.461569 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000045 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.705714, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.499318 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.500240 +INFO: TimeDuration, Event = Add_end, Time = 0.000922 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352770.500256 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352770.501119 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000863 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352770.501134 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352770.504039 +INFO: TimeDuration, Event = Pool_end, Time = 0.002904 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.531316 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.532037 +INFO: TimeDuration, Event = Add_end, Time = 0.000721 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352770.532052 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352770.532700 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000649 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352770.532714 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352770.535907 +INFO: TimeDuration, Event = Pool_end, Time = 0.003194 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.553673 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.554137 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352770.554149 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352770.554483 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000334 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.567153 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.567462 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352770.567474 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352770.567703 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.581874 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.582179 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352770.582191 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352770.582422 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352770.582439 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352770.585170 +INFO: TimeDuration, Event = Pool_end, Time = 0.002731 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352770.585189 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352770.585287 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.585299 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.585320 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352770.585334 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352770.585377 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 80.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.643167, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.622747 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.623672 +INFO: TimeDuration, Event = Add_end, Time = 0.000924 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352770.623686 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352770.624552 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000865 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352770.624597 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352770.627464 +INFO: TimeDuration, Event = Pool_end, Time = 0.002867 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.654707 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.655427 +INFO: TimeDuration, Event = Add_end, Time = 0.000720 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352770.655440 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352770.656085 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000646 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352770.656098 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352770.659297 +INFO: TimeDuration, Event = Pool_end, Time = 0.003199 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.680789 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.681257 +INFO: TimeDuration, Event = Add_end, Time = 0.000468 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352770.681270 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352770.681606 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000336 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.694441 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.694748 +INFO: TimeDuration, Event = Add_end, Time = 0.000306 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352770.694760 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352770.694989 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.709224 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.709528 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352770.709541 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352770.709770 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352770.709788 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352770.712519 +INFO: TimeDuration, Event = Pool_end, Time = 0.002731 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352770.712570 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352770.712669 +INFO: TimeDuration, Event = Mul_end, Time = 0.000099 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.712682 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.712703 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352770.712717 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352770.712766 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000050 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 79.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 99.519238, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.755507 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.756481 +INFO: TimeDuration, Event = Add_end, Time = 0.000974 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352770.756616 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352770.757480 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000865 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352770.757493 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352770.761279 +INFO: TimeDuration, Event = Pool_end, Time = 0.003786 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.787415 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.788135 +INFO: TimeDuration, Event = Add_end, Time = 0.000720 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352770.788146 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352770.788795 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000649 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352770.788809 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352770.792007 +INFO: TimeDuration, Event = Pool_end, Time = 0.003198 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.809779 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.810245 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352770.810274 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352770.810606 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000332 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.823273 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.823581 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352770.823594 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352770.823822 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.837996 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.838299 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352770.838313 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352770.838543 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352770.838559 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352770.841293 +INFO: TimeDuration, Event = Pool_end, Time = 0.002734 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352770.841312 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352770.841414 +INFO: TimeDuration, Event = Mul_end, Time = 0.000102 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.841428 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.841450 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352770.841464 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352770.841506 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 78.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 99.043276, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.878670 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.879589 +INFO: TimeDuration, Event = Add_end, Time = 0.000920 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352770.879604 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352770.880592 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000987 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352770.880609 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352770.884442 +INFO: TimeDuration, Event = Pool_end, Time = 0.003834 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 98304000 +DEBUG: Attempting to Allocate = 98304000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 49152, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.910615 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 24576000 +INFO: bias->num_elems = 192 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.911333 +INFO: TimeDuration, Event = Add_end, Time = 0.000718 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352770.911347 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352770.911995 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000648 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352770.912006 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 192, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 24576000 +DEBUG: Attempting to Allocate = 24576000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 12288, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352770.915206 +INFO: TimeDuration, Event = Pool_end, Time = 0.003199 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 49152000 +DEBUG: Attempting to Allocate = 49152000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 24576, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.933003 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 12288000 +INFO: bias->num_elems = 384 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.933467 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352770.933479 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352770.933814 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000335 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.946466 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.946774 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352770.946785 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352770.947016 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.961226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.961531 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352770.961543 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352770.961772 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352770.961787 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352770.964537 +INFO: TimeDuration, Event = Pool_end, Time = 0.002750 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352770.964678 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 4096 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352770.964797 +INFO: TimeDuration, Event = Mul_end, Time = 0.000119 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352770.964812 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352770.964851 +INFO: TimeDuration, Event = Add_end, Time = 0.000039 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352770.964865 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352770.964910 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 76.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 95.737806, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +Exiting profiler +INFO: Writing Runtime Profile Info File... +INFO: Done writing profile. diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/predictive/alexnet.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/predictive/alexnet.txt new file mode 100644 index 0000000000000000000000000000000000000000..a9ccba6eb63f620c0e3b6f95fd7c50892018f00f --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet/predictive/alexnet.txt @@ -0,0 +1,511 @@ +2592.187221 ++++++ +conf1 1 1 79.28 0.0 +1 gpu conv fp32 11 add fp32 1 tanh fp32 1 pool_max fp32 1 +2 gpu conv fp32 11 add fp32 1 tanh fp32 1 pool_max fp32 1 +3 gpu conv fp32 11 add fp32 1 tanh fp32 1 +4 gpu conv fp32 11 add fp32 1 tanh fp32 1 +5 gpu conv fp32 11 add fp32 1 tanh fp32 1 pool_max fp32 1 +6 gpu mul fp32 11 add fp32 1 +7 gpu softmax fp32 1 +----- ++++++ +conf2 1.7593976485873195 1.6193399031642917 79.23 0.04999999999999716 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv fp16 12 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf3 2.092625440752526 1.9139078015388271 78.96 0.3200000000000074 +1 gpu conv samp_fp16 263 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 266 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv fp16 12 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf4 1.8870195448805414 1.7296919053025768 78.8 0.480000000000004 +1 gpu conv samp_fp16 263 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv fp16 12 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf5 2.1184804041774554 1.9598989563949536 78.75999999999999 0.5200000000000102 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 263 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf6 2.1184804041774554 1.9598989563949536 78.75999999999999 0.5200000000000102 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 263 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf7 2.0933825381386364 1.9150743378318535 78.64 0.6400000000000006 +1 gpu conv samp_fp16 263 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv fp16 12 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf8 2.081712090729918 1.9102226906341664 78.5 0.7800000000000011 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv fp16 12 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf9 2.081712090729918 1.9102226906341664 78.5 0.7800000000000011 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv fp16 12 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf10 2.2662606588487595 2.066560750795139 78.48 0.7999999999999972 +1 gpu conv samp_fp16 264 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv fp16 12 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf11 2.121684761285686 1.966318179285323 78.48 0.7999999999999972 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 263 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf12 2.3417491169395532 2.1355030360671465 78.38000000000001 0.8999999999999915 +1 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv fp16 12 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf13 2.2247938983110425 2.060416584958474 78.38000000000001 0.8999999999999915 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf14 2.2247938983110425 2.060416584958474 78.38000000000001 0.8999999999999915 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf15 2.2247938983110425 2.060416584958474 78.38000000000001 0.8999999999999915 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf16 2.2627828537139263 2.065683616898884 78.32000000000001 0.9599999999999937 +1 gpu conv samp_fp16 263 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv fp16 12 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf17 2.2627828537139263 2.065683616898884 78.32000000000001 0.9599999999999937 +1 gpu conv samp_fp16 263 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv fp16 12 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf18 2.2627828537139263 2.065683616898884 78.32000000000001 0.9599999999999937 +1 gpu conv samp_fp16 263 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv fp16 12 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf19 2.146571989407323 1.95711703610764 78.18 1.0999999999999943 +1 gpu conv samp_fp16 264 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 267 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 266 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf20 2.303316973793268 2.1036463961913276 78.10000000000001 1.1799999999999926 +1 gpu conv samp_fp16 263 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv fp16 12 add fp16 1 tanh fp16 1 +5 gpu conv samp_fp16 267 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf21 2.436875653706139 2.2434837737118056 78.08 1.2000000000000028 +1 gpu conv samp_fp16 264 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf22 2.436875653706139 2.2434837737118056 78.08 1.2000000000000028 +1 gpu conv samp_fp16 264 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf23 2.436875653706139 2.2434837737118056 78.08 1.2000000000000028 +1 gpu conv samp_fp16 264 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf24 2.1106508925330925 1.9419233584234938 78.06 1.2199999999999989 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv fp16 12 add fp16 1 tanh fp16 1 +5 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf25 2.3203534290038634 2.116965679235447 78.06 1.2199999999999989 +1 gpu conv samp_fp16 264 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 267 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf26 2.3527290658539215 2.145832257234814 78.03999999999999 1.240000000000009 +1 gpu conv samp_fp16 261 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv fp16 12 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf27 2.3527290658539215 2.145832257234814 78.03999999999999 1.240000000000009 +1 gpu conv samp_fp16 261 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv fp16 12 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf28 2.432854949808342 2.2424500615508003 78.0 1.2800000000000011 +1 gpu conv samp_fp16 263 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf29 2.432854949808342 2.2424500615508003 78.0 1.2800000000000011 +1 gpu conv samp_fp16 263 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf30 2.432854949808342 2.2424500615508003 78.0 1.2800000000000011 +1 gpu conv samp_fp16 263 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf31 2.3137982135449207 2.1281257317083417 77.84 1.4399999999999977 +1 gpu conv samp_fp16 264 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 265 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf32 2.1198074418988333 1.9522214255218437 77.82 1.460000000000008 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf33 2.246924974355375 2.065289762405701 77.8 1.480000000000004 +1 gpu conv samp_fp16 264 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 269 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf34 2.263614734554485 2.090777846534249 77.74 1.5400000000000063 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 267 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf35 2.263614734554485 2.090777846534249 77.74 1.5400000000000063 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 267 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf36 2.263614734554485 2.090777846534249 77.74 1.5400000000000063 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 267 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf37 2.5289288699015304 2.334007588396142 77.72 1.5600000000000023 +1 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf38 2.5289288699015304 2.334007588396142 77.72 1.5600000000000023 +1 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf39 2.3117594882585775 2.1152397180868943 77.56 1.7199999999999989 +1 gpu conv samp_fp16 264 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 266 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf40 2.452732477854469 2.264573687601476 77.56 1.7199999999999989 +1 gpu conv perf_fp16 167 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf41 2.452732477854469 2.264573687601476 77.56 1.7199999999999989 +1 gpu conv perf_fp16 167 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf42 2.382518688546389 2.178614303992064 77.5 1.7800000000000011 +1 gpu conv samp_fp16 261 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 267 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 267 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf43 2.382518688546389 2.178614303992064 77.5 1.7800000000000011 +1 gpu conv samp_fp16 261 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 267 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 267 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf44 2.3900667100485924 2.188128526401265 77.48 1.7999999999999972 +1 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf45 2.3900667100485924 2.188128526401265 77.48 1.7999999999999972 +1 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf46 2.3900667100485924 2.188128526401265 77.48 1.7999999999999972 +1 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf47 2.4835281673276515 2.279527076032239 77.3 1.980000000000004 +1 gpu conv samp_fp16 264 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 267 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf48 2.4835281673276515 2.279527076032239 77.3 1.980000000000004 +1 gpu conv samp_fp16 264 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 267 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf49 2.1553694968551302 1.9959124044028933 77.18 2.0999999999999943 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 265 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf50 2.5877520959724816 2.3763616521050364 77.03999999999999 2.240000000000009 +1 gpu conv samp_fp16 261 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 267 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- ++++++ +conf51 2.5877520959724816 2.3763616521050364 77.03999999999999 2.240000000000009 +1 gpu conv samp_fp16 261 add fp16 1 tanh fp16 1 pool_max fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 267 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 +7 gpu softmax fp32 1 +----- diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/Makefile b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/Makefile index fc0495605d254cf5bbefb67502e9cd18b164c6d0..cae7df33de24e4d20e2d0d2b0977709a1865c9a8 100644 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/Makefile +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/Makefile @@ -40,6 +40,11 @@ CONF_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/data/ VISC_OPTFLAGS3 = -load $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LLVMInPlaceDFGAnalysis.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_WrapperAPI.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_X86.so -load $(HPVM_LIB_DIR)/LLVMFuseHPVMTensorNodes.so -load $(HPVM_LIB_DIR)/LLVMClearDFG.so -inplace -hpvm-fuse -dfg2llvm-wrapperapi -quantization-levels-filename=$(WRAPPER_API_QUANT_FILE_PATH) -configuration-inputs-filename=$(CONF_FILE_PATH) -dfg2llvm-x86 -clearDFG +TEMP_CONF_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/predictive/temp.txt + + +VISC_PRED_OPTFLAGS3 = -load $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LLVMInPlaceDFGAnalysis.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_WrapperAPI.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_X86.so -load $(HPVM_LIB_DIR)/LLVMFuseHPVMTensorNodes.so -load $(HPVM_LIB_DIR)/LLVMClearDFG.so -inplace -hpvm-fuse -dfg2llvm-wrapperapi -quantization-levels-filename=$(WRAPPER_API_QUANT_FILE_PATH) -configuration-inputs-filename=$(TEMP_CONF_FILE_PATH) -dfg2llvm-x86 -clearDFG + TARGET = $(BUILD_DIR)/$(APP).opt.bc SOURCES = $(SRC_DIR)/$(APP).cpp VISC_RT_PATH = $(LLVM_SRC_ROOT)/../build/projects/visc-rt/visc-rt.ll @@ -63,14 +68,20 @@ $(BUILD_DIR)/%.opt.bc: $(BUILD_DIR)/%.ll #$(OPT) $(VISC_OPTFLAGS2) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_promise.bc $(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_wrapperapi.bc $(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_loop.visc.ll -o $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc + $(OPT) $(VISC_PRED_OPTFLAGS3) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_pred_wrapperapi.bc + $(OPT) $(VISC_PRED_OPTFLAGS3) $(BUILD_DIR)/$(APP)_loop.visc.ll -o $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi.bc $(LLVM_LINK) $(BUILD_DIR)/$(APP)_cudnn.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_cudnn_linked.bc #$(LLVM_LINK) $(BUILD_DIR)/$(APP)_promise.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_promise_linked.bc $(LLVM_LINK) $(BUILD_DIR)/$(APP)_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(LLVM_LINK) $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc + $(LLVM_LINK) $(BUILD_DIR)/$(APP)_pred_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_pred_wrapperapi_linked.bc + $(LLVM_LINK) $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi_linked.bc $(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_cudnn_linked $(LINKER_FLAGS) #$(CC) $(BUILD_DIR)/$(APP)_promise_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_promise_linked $(LINKER_FLAGS) $(CC) $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked $(LINKER_FLAGS) $(CC) $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked $(LINKER_FLAGS) + $(CC) $(BUILD_DIR)/$(APP)_pred_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_pred_wrapperapi_linked $(LINKER_FLAGS) + $(CC) $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi_linked $(LINKER_FLAGS) $(BUILD_DIR): mkdir -p $@ diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/data/run_data/out-run-1 b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/data/run_data/out-run-1 new file mode 100644 index 0000000000000000000000000000000000000000..daddbdd93b29c04a381f5ec8101c66d3498c396e --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/data/run_data/out-run-1 @@ -0,0 +1,45868 @@ +size_in_bytes = 3456 +DEBUG: ***--- size_in_bytes = 3456 +DEBUG: Attempting to Allocate = 3456 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 27, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 128 +DEBUG: ***--- size_in_bytes = 128 +DEBUG: Attempting to Allocate = 128 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 36864 +DEBUG: ***--- size_in_bytes = 36864 +DEBUG: Attempting to Allocate = 36864 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 288, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 128 +DEBUG: ***--- size_in_bytes = 128 +DEBUG: Attempting to Allocate = 128 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 73728 +DEBUG: ***--- size_in_bytes = 73728 +DEBUG: Attempting to Allocate = 73728 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 288, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 256 +DEBUG: ***--- size_in_bytes = 256 +DEBUG: Attempting to Allocate = 256 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 64, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 147456 +DEBUG: ***--- size_in_bytes = 147456 +DEBUG: Attempting to Allocate = 147456 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 576, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 256 +DEBUG: ***--- size_in_bytes = 256 +DEBUG: Attempting to Allocate = 256 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 64, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 294912 +DEBUG: ***--- size_in_bytes = 294912 +DEBUG: Attempting to Allocate = 294912 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 576, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 512 +DEBUG: ***--- size_in_bytes = 512 +DEBUG: Attempting to Allocate = 512 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 128, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 589824 +DEBUG: ***--- size_in_bytes = 589824 +DEBUG: Attempting to Allocate = 589824 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 1152, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 512 +DEBUG: ***--- size_in_bytes = 512 +DEBUG: Attempting to Allocate = 512 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 128, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 81920 +DEBUG: ***--- size_in_bytes = 81920 +DEBUG: Attempting to Allocate = 81920 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 20480, cStride = 20480, hStride = 10, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 40 +DEBUG: ***--- size_in_bytes = 40 +DEBUG: Attempting to Allocate = 40 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INITIALIZING GPU 0 +CREATED HANDLES 0 +INFO: +WARNING: File 'opentuner_flags' not found + + +initializing tuner .... +* LLVM_SRC_ROOT = /home/akashk4/merge/profiling/hpvm/llvm +- knobs_file_path = /home/akashk4/merge/profiling/hpvm/llvm/projects/hpvm-tensor-rt/autotuner/data/global_knobs.txt +*LLVM_SRC_ROOT = /home/akashk4/merge/profiling/hpvm/llvm- knobs_file_path = /home/akashk4/merge/profiling/hpvm/llvm/projects/hpvm-tensor-rt/autotuner/data/global_knobs.txt +Read PROMISE FLAGS 0 +DONE INTIALIZING GPU 0 +INFO: Reading Quantization Ranges File... +INFO: DONE. +INFO: Reading Configuration File... +DEBUG: first_line: 2000 +DEBUG: Baseline time: 2000.000000 + +DEBUG: line: +++++ +DEBUG: t: +++++ +DEBUG: +DEBUG: line: conf1 2.64294896823 0 84.24999995 -0.05999995000000524 +DEBUG: t: conf1 +DEBUG: t: 2.64294896823 +DEBUG: t: 0 +DEBUG: t: 84.24999995 +DEBUG: t: -0.05999995000000524 +DEBUG: +DEBUG: line: 1 gpu conv fp32 1 add fp32 1 tanh fp32 1 +DEBUG: t: 1 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: tanh +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 1 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found tanh operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 2 gpu conv fp32 1 add fp32 1 tanh fp32 1 pool_max fp32 1 +DEBUG: t: 2 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: tanh +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: pool_max +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 4 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found tanh operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found pool_max operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 3 gpu conv fp32 1 add fp32 1 tanh fp32 1 +DEBUG: t: 3 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: tanh +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 8 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found tanh operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 4 gpu conv fp32 1 add fp32 1 tanh fp32 1 pool_max fp32 1 +DEBUG: t: 4 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: tanh +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: pool_max +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 11 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found tanh operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found pool_max operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 5 gpu conv fp32 1 add fp32 1 tanh fp32 1 +DEBUG: t: 5 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: tanh +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 15 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found tanh operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 6 gpu conv fp32 1 add fp32 1 tanh fp32 1 pool_max fp32 1 +DEBUG: t: 6 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: tanh +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: pool_max +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 18 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found tanh operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found pool_max operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 7 gpu mul fp32 1 add fp32 1 +DEBUG: t: 7 +DEBUG: t: gpu +DEBUG: t: mul +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 22 + +DEBUG: Found mul operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 8 gpu softmax fp32 1 +DEBUG: t: 8 +DEBUG: t: gpu +DEBUG: t: softmax +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 24 + +DEBUG: Found softmax operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: ----- +DEBUG: t: ----- +DEBUG: +DEBUG: DONE. +INFO: Sorting autotuner configurations... +INFO: Done sorting. +INFO: Speedup Configurations ++++++ +conf1 2.642949 0.000000 84.250000 -0.060000 +1 : gpu conv fp32 1 add fp32 1 tanh fp32 1 +2 : gpu conv fp32 1 add fp32 1 tanh fp32 1 pool_max fp32 1 +3 : gpu conv fp32 1 add fp32 1 tanh fp32 1 +4 : gpu conv fp32 1 add fp32 1 tanh fp32 1 pool_max fp32 1 +5 : gpu conv fp32 1 add fp32 1 tanh fp32 1 +6 : gpu conv fp32 1 add fp32 1 tanh fp32 1 pool_max fp32 1 +7 : gpu mul fp32 1 add fp32 1 +8 : gpu softmax fp32 1 +----- +DEBUG: slowdowns file not found. Initializing slowdowns randomly. +*LLVM_SRC_ROOT = /home/akashk4/merge/profiling/hpvm/llvm- knobs_file_path = /home/akashk4/merge/profiling/hpvm/llvm/projects/hpvm-tensor-rt/autotuner/data/global_knobs.txt +* LLVM_SRC_ROOT = /home/akashk4/merge/profiling/hpvm/llvm +- knobs_file_path = /home/akashk4/merge/profiling/hpvm/llvm/projects/hpvm-tensor-rt/autotuner/data/global_knobs.txt +WARNING: pause_profiler was already called +Initializing policy object ... +DONE: Initializing policy object. +Select target device (0 for CPU, 1 fpr GPU): DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +INFO: Moving 3456 bytes from host to GPU +INFO: Moving 128 bytes from host to GPU +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352772.841121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352772.841637 +INFO: TimeDuration, Event = Add_end, Time = 0.000516 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352772.841664 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352772.842129 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000465 +DEBUG: No data movement required - Data on Device +INFO: Moving 36864 bytes from host to GPU +INFO: Moving 128 bytes from host to GPU +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352772.851054 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352772.851532 +INFO: TimeDuration, Event = Add_end, Time = 0.000478 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352772.851560 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352772.852004 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000444 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352772.852022 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352772.855581 +INFO: TimeDuration, Event = Pool_end, Time = 0.003559 +DEBUG: No data movement required - Data on Device +INFO: Moving 73728 bytes from host to GPU +INFO: Moving 256 bytes from host to GPU +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352772.868034 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352772.868295 +INFO: TimeDuration, Event = Add_end, Time = 0.000261 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352772.868323 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352772.868559 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000236 +DEBUG: No data movement required - Data on Device +INFO: Moving 147456 bytes from host to GPU +INFO: Moving 256 bytes from host to GPU +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352772.875953 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352772.876213 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352772.876230 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352772.876467 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000237 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352772.876485 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352772.879251 +INFO: TimeDuration, Event = Pool_end, Time = 0.002767 +DEBUG: No data movement required - Data on Device +INFO: Moving 294912 bytes from host to GPU +INFO: Moving 512 bytes from host to GPU +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352772.884984 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352772.885151 +INFO: TimeDuration, Event = Add_end, Time = 0.000167 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352772.885169 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352772.885298 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000130 +DEBUG: No data movement required - Data on Device +INFO: Moving 589824 bytes from host to GPU +INFO: Moving 512 bytes from host to GPU +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352772.889602 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352772.889770 +INFO: TimeDuration, Event = Add_end, Time = 0.000168 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352772.889787 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352772.889917 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000131 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352772.889933 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352772.892838 +INFO: TimeDuration, Event = Pool_end, Time = 0.002905 +DEBUG: No data movement required - Data on Device +INFO: Moving 81920 bytes from host to GPU +INFO: Moving 40 bytes from host to GPU +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352772.893256 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352772.893342 +INFO: TimeDuration, Event = Mul_end, Time = 0.000086 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352772.893363 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352772.893391 +INFO: TimeDuration, Event = Add_end, Time = 0.000027 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352772.893408 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352772.893481 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000073 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 43.610630, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352772.975309 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352772.975835 +INFO: TimeDuration, Event = Add_end, Time = 0.000527 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352772.975858 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352772.976366 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000508 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352772.984446 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352772.984915 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352772.984939 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352772.985381 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000442 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352772.985399 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352772.988981 +INFO: TimeDuration, Event = Pool_end, Time = 0.003582 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.000540 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.000840 +INFO: TimeDuration, Event = Add_end, Time = 0.000300 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.000864 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.001098 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000234 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.008255 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.008515 +INFO: TimeDuration, Event = Add_end, Time = 0.000261 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.008539 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.008776 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000236 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352773.008791 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352773.011551 +INFO: TimeDuration, Event = Pool_end, Time = 0.002760 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.017116 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.017285 +INFO: TimeDuration, Event = Add_end, Time = 0.000169 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.017302 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.017430 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000128 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.021492 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.021660 +INFO: TimeDuration, Event = Add_end, Time = 0.000167 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.021676 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.021804 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000128 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352773.021822 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352773.024728 +INFO: TimeDuration, Event = Pool_end, Time = 0.002906 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352773.024752 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352773.024835 +INFO: TimeDuration, Event = Mul_end, Time = 0.000083 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.024851 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.024911 +INFO: TimeDuration, Event = Add_end, Time = 0.000059 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352773.024933 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352773.025029 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000096 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 52.857739, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.116424 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.116929 +INFO: TimeDuration, Event = Add_end, Time = 0.000504 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.116951 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.117395 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000444 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.125124 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.125604 +INFO: TimeDuration, Event = Add_end, Time = 0.000480 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.125627 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.126073 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000446 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352773.126095 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352773.129616 +INFO: TimeDuration, Event = Pool_end, Time = 0.003520 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.141210 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.141472 +INFO: TimeDuration, Event = Add_end, Time = 0.000262 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.141494 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.141731 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000237 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.148902 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.149162 +INFO: TimeDuration, Event = Add_end, Time = 0.000259 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.149178 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.149414 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000235 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352773.149429 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352773.152205 +INFO: TimeDuration, Event = Pool_end, Time = 0.002776 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.157743 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.157909 +INFO: TimeDuration, Event = Add_end, Time = 0.000166 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.157926 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.158055 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000129 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.162129 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.162295 +INFO: TimeDuration, Event = Add_end, Time = 0.000166 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.162311 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.162441 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000130 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352773.162455 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352773.165367 +INFO: TimeDuration, Event = Pool_end, Time = 0.002912 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352773.165392 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352773.165476 +INFO: TimeDuration, Event = Mul_end, Time = 0.000084 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.165494 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.165519 +INFO: TimeDuration, Event = Add_end, Time = 0.000025 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352773.165538 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352773.165611 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000073 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 85.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 52.628632, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.234672 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.235173 +INFO: TimeDuration, Event = Add_end, Time = 0.000501 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.235400 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.235855 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000455 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.245052 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.245534 +INFO: TimeDuration, Event = Add_end, Time = 0.000482 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.245555 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.245996 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000442 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352773.246013 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352773.249519 +INFO: TimeDuration, Event = Pool_end, Time = 0.003505 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.261089 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.261351 +INFO: TimeDuration, Event = Add_end, Time = 0.000262 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.261377 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.261615 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000239 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.268774 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.269035 +INFO: TimeDuration, Event = Add_end, Time = 0.000261 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.269053 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.269287 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000235 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352773.269303 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352773.272074 +INFO: TimeDuration, Event = Pool_end, Time = 0.002771 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.277610 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.277777 +INFO: TimeDuration, Event = Add_end, Time = 0.000168 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.277794 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.277923 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000129 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.281910 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.282076 +INFO: TimeDuration, Event = Add_end, Time = 0.000166 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.282093 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.282222 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000129 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352773.282238 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352773.285149 +INFO: TimeDuration, Event = Pool_end, Time = 0.002912 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352773.285175 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352773.285258 +INFO: TimeDuration, Event = Mul_end, Time = 0.000083 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.285274 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.285300 +INFO: TimeDuration, Event = Add_end, Time = 0.000026 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352773.285317 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352773.285372 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000055 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 53.728972, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.354815 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.355312 +INFO: TimeDuration, Event = Add_end, Time = 0.000498 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.355343 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.355801 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000458 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.364697 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.365167 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.365188 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.365627 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000439 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352773.365645 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352773.369167 +INFO: TimeDuration, Event = Pool_end, Time = 0.003522 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.380760 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.381025 +INFO: TimeDuration, Event = Add_end, Time = 0.000265 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.381048 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.381281 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000233 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.388501 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.388758 +INFO: TimeDuration, Event = Add_end, Time = 0.000258 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.388776 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.389011 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000235 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352773.389027 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352773.391803 +INFO: TimeDuration, Event = Pool_end, Time = 0.002776 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.397370 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.397538 +INFO: TimeDuration, Event = Add_end, Time = 0.000168 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.397555 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.397682 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.401699 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.401865 +INFO: TimeDuration, Event = Add_end, Time = 0.000167 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.401883 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.402013 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000130 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352773.402031 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352773.404932 +INFO: TimeDuration, Event = Pool_end, Time = 0.002902 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352773.404959 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352773.405042 +INFO: TimeDuration, Event = Mul_end, Time = 0.000083 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.405059 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.405084 +INFO: TimeDuration, Event = Add_end, Time = 0.000025 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352773.405103 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352773.405163 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000060 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 84.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 53.310803, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.478323 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.478834 +INFO: TimeDuration, Event = Add_end, Time = 0.000511 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.478872 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.479331 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000460 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.489196 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.489685 +INFO: TimeDuration, Event = Add_end, Time = 0.000489 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.489707 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.490151 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000444 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352773.490168 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352773.493656 +INFO: TimeDuration, Event = Pool_end, Time = 0.003488 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.505253 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.505520 +INFO: TimeDuration, Event = Add_end, Time = 0.000267 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.505545 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.505781 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000237 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.512957 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.513214 +INFO: TimeDuration, Event = Add_end, Time = 0.000257 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.513231 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.513465 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000235 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352773.513480 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352773.516266 +INFO: TimeDuration, Event = Pool_end, Time = 0.002786 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.521826 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.521995 +INFO: TimeDuration, Event = Add_end, Time = 0.000168 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.522011 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.522142 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000130 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.526128 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.526295 +INFO: TimeDuration, Event = Add_end, Time = 0.000167 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.526311 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.526441 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000130 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352773.526456 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352773.529366 +INFO: TimeDuration, Event = Pool_end, Time = 0.002910 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352773.529390 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352773.529474 +INFO: TimeDuration, Event = Mul_end, Time = 0.000083 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.529491 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.529516 +INFO: TimeDuration, Event = Add_end, Time = 0.000025 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352773.529534 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352773.529588 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000054 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.199997 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 56.370145, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.598669 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.599179 +INFO: TimeDuration, Event = Add_end, Time = 0.000510 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.599221 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.599696 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000475 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.609380 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.609871 +INFO: TimeDuration, Event = Add_end, Time = 0.000490 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.609905 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.610360 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000454 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352773.610391 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352773.613854 +INFO: TimeDuration, Event = Pool_end, Time = 0.003463 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.625584 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.625851 +INFO: TimeDuration, Event = Add_end, Time = 0.000267 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.625875 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.626113 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000238 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.633880 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.634145 +INFO: TimeDuration, Event = Add_end, Time = 0.000265 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.634175 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.634418 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000243 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352773.634446 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352773.637179 +INFO: TimeDuration, Event = Pool_end, Time = 0.002733 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.643035 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.643212 +INFO: TimeDuration, Event = Add_end, Time = 0.000177 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.643243 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.643384 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000141 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.647758 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.647933 +INFO: TimeDuration, Event = Add_end, Time = 0.000175 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.647964 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.648103 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000139 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352773.648127 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352773.650978 +INFO: TimeDuration, Event = Pool_end, Time = 0.002851 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352773.651005 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352773.651087 +INFO: TimeDuration, Event = Mul_end, Time = 0.000082 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.651105 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.651131 +INFO: TimeDuration, Event = Add_end, Time = 0.000026 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352773.651149 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352773.651204 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000056 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 84.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 55.789074, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.720960 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.721474 +INFO: TimeDuration, Event = Add_end, Time = 0.000514 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.721605 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.722070 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000465 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.732606 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.733107 +INFO: TimeDuration, Event = Add_end, Time = 0.000501 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.733148 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.733606 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000458 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352773.733637 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352773.736802 +INFO: TimeDuration, Event = Pool_end, Time = 0.003166 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.748358 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.748620 +INFO: TimeDuration, Event = Add_end, Time = 0.000262 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.748642 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.748878 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000236 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.756053 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.756314 +INFO: TimeDuration, Event = Add_end, Time = 0.000261 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.756483 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.756715 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352773.756730 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352773.759351 +INFO: TimeDuration, Event = Pool_end, Time = 0.002620 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.764919 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.765087 +INFO: TimeDuration, Event = Add_end, Time = 0.000168 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.765104 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.765234 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000130 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.769208 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.769376 +INFO: TimeDuration, Event = Add_end, Time = 0.000168 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.769392 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.769521 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000129 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352773.769536 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352773.772445 +INFO: TimeDuration, Event = Pool_end, Time = 0.002909 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352773.772465 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352773.772547 +INFO: TimeDuration, Event = Mul_end, Time = 0.000082 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.772567 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.772592 +INFO: TimeDuration, Event = Add_end, Time = 0.000026 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352773.772611 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352773.772664 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000054 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 85.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 54.458160, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.840330 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.840832 +INFO: TimeDuration, Event = Add_end, Time = 0.000503 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.840856 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.841284 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000428 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.849092 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.849568 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.849591 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.850041 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000450 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352773.850058 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352773.853575 +INFO: TimeDuration, Event = Pool_end, Time = 0.003517 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.865153 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.865413 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.865435 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.865670 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000234 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.872841 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.873102 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.873121 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.873354 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000233 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352773.873371 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352773.876138 +INFO: TimeDuration, Event = Pool_end, Time = 0.002768 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.884768 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.884957 +INFO: TimeDuration, Event = Add_end, Time = 0.000190 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.884977 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.885117 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000140 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.889057 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.889223 +INFO: TimeDuration, Event = Add_end, Time = 0.000166 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.889240 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.889371 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000131 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352773.889387 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352773.892320 +INFO: TimeDuration, Event = Pool_end, Time = 0.002933 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352773.892488 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352773.892719 +INFO: TimeDuration, Event = Mul_end, Time = 0.000231 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.892737 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.892773 +INFO: TimeDuration, Event = Add_end, Time = 0.000036 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352773.892791 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352773.892883 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000091 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 55.434243, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.966177 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.966688 +INFO: TimeDuration, Event = Add_end, Time = 0.000510 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.966711 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.967158 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000447 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.975125 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.975598 +INFO: TimeDuration, Event = Add_end, Time = 0.000473 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.975619 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.976062 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000443 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352773.976079 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352773.979609 +INFO: TimeDuration, Event = Pool_end, Time = 0.003530 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.991194 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.991457 +INFO: TimeDuration, Event = Add_end, Time = 0.000263 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.991478 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.991712 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000234 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352773.998900 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352773.999173 +INFO: TimeDuration, Event = Add_end, Time = 0.000273 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352773.999191 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352773.999424 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000233 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352773.999441 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.002198 +INFO: TimeDuration, Event = Pool_end, Time = 0.002757 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.007719 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.007887 +INFO: TimeDuration, Event = Add_end, Time = 0.000168 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.007903 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.008035 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000132 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.012019 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.012185 +INFO: TimeDuration, Event = Add_end, Time = 0.000167 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.012201 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.012331 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000130 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.012492 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.015252 +INFO: TimeDuration, Event = Pool_end, Time = 0.002760 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352774.015277 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352774.015361 +INFO: TimeDuration, Event = Mul_end, Time = 0.000083 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.015378 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.015404 +INFO: TimeDuration, Event = Add_end, Time = 0.000026 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352774.015421 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352774.015476 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000056 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 51.809966, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.037959 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.038435 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.038455 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.038898 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000442 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.046881 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.047350 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.047372 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.047811 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000439 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.047828 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.051361 +INFO: TimeDuration, Event = Pool_end, Time = 0.003533 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.062967 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.063231 +INFO: TimeDuration, Event = Add_end, Time = 0.000264 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.063253 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.063487 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000234 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.070644 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.070901 +INFO: TimeDuration, Event = Add_end, Time = 0.000257 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.070917 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.071151 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000234 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.071166 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.073941 +INFO: TimeDuration, Event = Pool_end, Time = 0.002775 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.079458 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.079626 +INFO: TimeDuration, Event = Add_end, Time = 0.000168 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.079690 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.079819 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000128 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.083747 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.083913 +INFO: TimeDuration, Event = Add_end, Time = 0.000166 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.083930 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.084060 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000130 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.084074 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.086984 +INFO: TimeDuration, Event = Pool_end, Time = 0.002910 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352774.087010 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352774.087091 +INFO: TimeDuration, Event = Mul_end, Time = 0.000081 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.087108 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.087133 +INFO: TimeDuration, Event = Add_end, Time = 0.000025 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352774.087151 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352774.087205 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000054 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 51.318447, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.109318 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.109795 +INFO: TimeDuration, Event = Add_end, Time = 0.000477 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.109814 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.110261 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000447 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.118222 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.118695 +INFO: TimeDuration, Event = Add_end, Time = 0.000472 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.118714 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.119159 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000445 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.119181 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.122706 +INFO: TimeDuration, Event = Pool_end, Time = 0.003525 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.134301 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.134567 +INFO: TimeDuration, Event = Add_end, Time = 0.000266 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.134588 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.134822 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000234 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.142009 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.142267 +INFO: TimeDuration, Event = Add_end, Time = 0.000258 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.142283 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.142522 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000238 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.142537 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.145306 +INFO: TimeDuration, Event = Pool_end, Time = 0.002769 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.150816 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.150985 +INFO: TimeDuration, Event = Add_end, Time = 0.000168 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.151001 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.151132 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000131 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.155200 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.155366 +INFO: TimeDuration, Event = Add_end, Time = 0.000166 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.155383 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.155514 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000131 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.155529 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.158438 +INFO: TimeDuration, Event = Pool_end, Time = 0.002908 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352774.158463 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352774.158546 +INFO: TimeDuration, Event = Mul_end, Time = 0.000082 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.158563 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.158588 +INFO: TimeDuration, Event = Add_end, Time = 0.000025 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352774.158606 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352774.158660 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000054 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 51.417224, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.180244 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.180713 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.180733 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.181176 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000443 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.188718 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.189185 +INFO: TimeDuration, Event = Add_end, Time = 0.000467 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.189200 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.189640 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000440 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.189654 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.193197 +INFO: TimeDuration, Event = Pool_end, Time = 0.003543 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.204663 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.204923 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.204940 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.205171 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.212121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.212376 +INFO: TimeDuration, Event = Add_end, Time = 0.000254 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.212482 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.212713 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.212727 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.215414 +INFO: TimeDuration, Event = Pool_end, Time = 0.002687 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.220787 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.220951 +INFO: TimeDuration, Event = Add_end, Time = 0.000164 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.220964 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.221090 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.224986 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.225147 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.225160 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.225287 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.225298 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.228223 +INFO: TimeDuration, Event = Pool_end, Time = 0.002925 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352774.228242 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352774.228315 +INFO: TimeDuration, Event = Mul_end, Time = 0.000073 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.228477 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.228502 +INFO: TimeDuration, Event = Add_end, Time = 0.000025 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352774.228516 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352774.228562 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000046 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 85.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.075457, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.251487 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.251958 +INFO: TimeDuration, Event = Add_end, Time = 0.000472 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.251974 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.252417 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000443 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.259950 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.260418 +INFO: TimeDuration, Event = Add_end, Time = 0.000468 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.260431 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.260870 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000440 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.260883 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.264427 +INFO: TimeDuration, Event = Pool_end, Time = 0.003544 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.275855 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.276113 +INFO: TimeDuration, Event = Add_end, Time = 0.000258 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.276130 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.276359 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.283340 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.283598 +INFO: TimeDuration, Event = Add_end, Time = 0.000258 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.283611 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.283841 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.283853 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.286634 +INFO: TimeDuration, Event = Pool_end, Time = 0.002781 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.292042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.292206 +INFO: TimeDuration, Event = Add_end, Time = 0.000164 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.292220 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.292348 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000128 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.296156 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.296329 +INFO: TimeDuration, Event = Add_end, Time = 0.000173 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.296339 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.296464 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.296476 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.299390 +INFO: TimeDuration, Event = Pool_end, Time = 0.002914 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352774.299410 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352774.299481 +INFO: TimeDuration, Event = Mul_end, Time = 0.000071 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.299494 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.299516 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352774.299530 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352774.299576 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000046 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 49.809200, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.336605 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.337075 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.337092 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.337531 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000439 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.344839 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.345333 +INFO: TimeDuration, Event = Add_end, Time = 0.000494 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.345349 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.345785 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.345800 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.349314 +INFO: TimeDuration, Event = Pool_end, Time = 0.003513 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.360797 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.361057 +INFO: TimeDuration, Event = Add_end, Time = 0.000259 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.361072 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.361302 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.368235 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.368487 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.368502 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.368733 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.368746 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.371531 +INFO: TimeDuration, Event = Pool_end, Time = 0.002785 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.376908 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.377071 +INFO: TimeDuration, Event = Add_end, Time = 0.000164 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.377084 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.377212 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000128 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.381105 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.381265 +INFO: TimeDuration, Event = Add_end, Time = 0.000160 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.381278 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.381404 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.381417 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.384337 +INFO: TimeDuration, Event = Pool_end, Time = 0.002921 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352774.384354 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352774.384538 +INFO: TimeDuration, Event = Mul_end, Time = 0.000184 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.384553 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.384575 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352774.384589 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352774.384656 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000067 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 84.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.195098, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.405277 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.405747 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.405763 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.406204 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000442 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.413756 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.414220 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.414237 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.414672 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000435 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.414685 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.418233 +INFO: TimeDuration, Event = Pool_end, Time = 0.003547 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.429663 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.429922 +INFO: TimeDuration, Event = Add_end, Time = 0.000259 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.429939 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.430168 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.437119 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.437371 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.437383 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.437613 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.437625 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.440414 +INFO: TimeDuration, Event = Pool_end, Time = 0.002789 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.445950 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.446114 +INFO: TimeDuration, Event = Add_end, Time = 0.000165 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.446127 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.446254 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.450049 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.450210 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.450223 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.450355 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000132 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.450368 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.453284 +INFO: TimeDuration, Event = Pool_end, Time = 0.002917 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352774.453304 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352774.453376 +INFO: TimeDuration, Event = Mul_end, Time = 0.000071 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.453390 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.453412 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352774.453426 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352774.453473 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000047 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.199997 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 49.927601, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.506349 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.506818 +INFO: TimeDuration, Event = Add_end, Time = 0.000468 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.506834 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.507276 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000442 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.514799 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.515263 +INFO: TimeDuration, Event = Add_end, Time = 0.000465 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.515279 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.515716 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000438 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.515730 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.519277 +INFO: TimeDuration, Event = Pool_end, Time = 0.003546 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.530785 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.531044 +INFO: TimeDuration, Event = Add_end, Time = 0.000259 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.531076 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.531307 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.538235 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.538486 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.538499 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.538729 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.538742 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.541530 +INFO: TimeDuration, Event = Pool_end, Time = 0.002787 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.546939 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.547102 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.547115 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.547245 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000131 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.551048 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.551209 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.551221 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.551347 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.551359 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.554282 +INFO: TimeDuration, Event = Pool_end, Time = 0.002924 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352774.554302 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352774.554374 +INFO: TimeDuration, Event = Mul_end, Time = 0.000071 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.554387 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.554409 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352774.554424 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352774.554469 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000045 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 84.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.250189, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.575080 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.575549 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.575565 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.576005 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000440 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.583568 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.584038 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.584054 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.584491 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000438 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.584618 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.588043 +INFO: TimeDuration, Event = Pool_end, Time = 0.003426 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.599476 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.599734 +INFO: TimeDuration, Event = Add_end, Time = 0.000258 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.599753 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.599982 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.606911 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.607161 +INFO: TimeDuration, Event = Add_end, Time = 0.000250 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.607174 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.607404 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.607416 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.610205 +INFO: TimeDuration, Event = Pool_end, Time = 0.002789 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.615588 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.615750 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.615763 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.615889 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.619695 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.619855 +INFO: TimeDuration, Event = Add_end, Time = 0.000160 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.619867 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.619992 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.620004 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.622926 +INFO: TimeDuration, Event = Pool_end, Time = 0.002922 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352774.622946 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352774.623018 +INFO: TimeDuration, Event = Mul_end, Time = 0.000072 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.623032 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.623054 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352774.623067 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352774.623112 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000045 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 85.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 49.809923, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.643998 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.644469 +INFO: TimeDuration, Event = Add_end, Time = 0.000472 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.644722 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.645160 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000438 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.652509 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.652973 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.652988 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.653427 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000439 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.653440 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.656986 +INFO: TimeDuration, Event = Pool_end, Time = 0.003546 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.668443 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.668702 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.668720 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.668950 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.675908 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.676159 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.676172 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.676404 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.676487 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.679205 +INFO: TimeDuration, Event = Pool_end, Time = 0.002717 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.684615 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.684814 +INFO: TimeDuration, Event = Add_end, Time = 0.000199 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.684829 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.684958 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000128 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.688731 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.688892 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.688905 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.689031 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.689042 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.691965 +INFO: TimeDuration, Event = Pool_end, Time = 0.002923 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352774.691985 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352774.692055 +INFO: TimeDuration, Event = Mul_end, Time = 0.000070 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.692070 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.692092 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352774.692105 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352774.692150 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000045 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 49.959897, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.713195 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.713708 +INFO: TimeDuration, Event = Add_end, Time = 0.000513 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.713725 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.714178 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000453 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.721684 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.722156 +INFO: TimeDuration, Event = Add_end, Time = 0.000472 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.722172 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.722611 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000438 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.722624 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.726180 +INFO: TimeDuration, Event = Pool_end, Time = 0.003556 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.737592 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.737870 +INFO: TimeDuration, Event = Add_end, Time = 0.000277 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.737886 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.738118 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.745059 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.745311 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.745325 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.745556 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.745569 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.748575 +INFO: TimeDuration, Event = Pool_end, Time = 0.003006 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.753740 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.753904 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.753918 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.754045 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.758110 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.758273 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.758286 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.758415 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000129 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.758430 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.761345 +INFO: TimeDuration, Event = Pool_end, Time = 0.002915 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352774.761365 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352774.761437 +INFO: TimeDuration, Event = Mul_end, Time = 0.000072 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.761450 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.761494 +INFO: TimeDuration, Event = Add_end, Time = 0.000044 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352774.761509 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352774.761569 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000060 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.345173, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.782637 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.783107 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.783123 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.783564 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000442 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.791149 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.791617 +INFO: TimeDuration, Event = Add_end, Time = 0.000468 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.791632 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.792073 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000440 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.792086 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.795613 +INFO: TimeDuration, Event = Pool_end, Time = 0.003527 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.807040 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.807298 +INFO: TimeDuration, Event = Add_end, Time = 0.000258 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.807314 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.807546 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.814499 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.814750 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.814764 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.814994 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.815006 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.817794 +INFO: TimeDuration, Event = Pool_end, Time = 0.002788 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.823186 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.823351 +INFO: TimeDuration, Event = Add_end, Time = 0.000164 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.823364 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.823493 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000129 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.827287 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.827448 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.827461 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.827588 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.827599 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.830521 +INFO: TimeDuration, Event = Pool_end, Time = 0.002922 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352774.830540 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352774.830611 +INFO: TimeDuration, Event = Mul_end, Time = 0.000071 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.830624 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.830645 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352774.830659 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352774.830703 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000045 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 49.984216, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.851479 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.851947 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.851962 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.852405 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000442 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.860068 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.860536 +INFO: TimeDuration, Event = Add_end, Time = 0.000468 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.860553 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.860991 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000439 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.861004 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.864542 +INFO: TimeDuration, Event = Pool_end, Time = 0.003538 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.875979 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.876239 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.876256 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.876487 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.883419 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.883671 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.883685 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.883914 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.883927 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.886716 +INFO: TimeDuration, Event = Pool_end, Time = 0.002790 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.892105 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.892268 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.892281 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.892411 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000129 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.896230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.896392 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.896461 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.896585 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000124 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.896597 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.899464 +INFO: TimeDuration, Event = Pool_end, Time = 0.002866 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352774.899483 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352774.899557 +INFO: TimeDuration, Event = Mul_end, Time = 0.000073 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.899570 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.899592 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352774.899606 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352774.899651 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000045 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 49.651008, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.920584 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.921055 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.921073 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.921518 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000445 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.929044 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.929510 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.929525 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.929964 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000438 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.929976 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.933522 +INFO: TimeDuration, Event = Pool_end, Time = 0.003545 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.944980 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.945239 +INFO: TimeDuration, Event = Add_end, Time = 0.000259 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.945256 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.945486 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.952484 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.952735 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.952749 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.952981 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.952993 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.955777 +INFO: TimeDuration, Event = Pool_end, Time = 0.002784 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.961153 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.961315 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.961328 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.961455 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.965250 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.965411 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.965424 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.965548 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000124 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.965560 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352774.968485 +INFO: TimeDuration, Event = Pool_end, Time = 0.002925 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352774.968505 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352774.968578 +INFO: TimeDuration, Event = Mul_end, Time = 0.000072 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.968592 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.968613 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352774.968627 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352774.968672 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000045 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 85.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 49.980030, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.989364 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.989837 +INFO: TimeDuration, Event = Add_end, Time = 0.000473 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.989852 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.990292 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000440 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352774.997909 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352774.998377 +INFO: TimeDuration, Event = Add_end, Time = 0.000468 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352774.998392 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352774.998831 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000438 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352774.998842 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.002338 +INFO: TimeDuration, Event = Pool_end, Time = 0.003496 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.013782 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.014040 +INFO: TimeDuration, Event = Add_end, Time = 0.000258 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.014058 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.014286 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.021231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.021483 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.021496 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.021726 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.021738 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.024749 +INFO: TimeDuration, Event = Pool_end, Time = 0.003011 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.029939 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.030102 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.030116 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.030242 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.034038 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.034203 +INFO: TimeDuration, Event = Add_end, Time = 0.000165 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.034215 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.034342 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.034355 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.037271 +INFO: TimeDuration, Event = Pool_end, Time = 0.002916 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352775.037290 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352775.037360 +INFO: TimeDuration, Event = Mul_end, Time = 0.000070 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.037374 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.037395 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352775.037409 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352775.037492 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000083 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.049396, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.058073 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.058542 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.058559 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.058998 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000439 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.066522 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.066987 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.067003 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.067439 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.067453 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.071003 +INFO: TimeDuration, Event = Pool_end, Time = 0.003549 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.082432 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.082691 +INFO: TimeDuration, Event = Add_end, Time = 0.000259 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.082708 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.082936 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.089883 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.090136 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.090149 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.090380 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.090392 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.093179 +INFO: TimeDuration, Event = Pool_end, Time = 0.002788 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.098562 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.098724 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.098740 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.098866 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.102672 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.102833 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.102846 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.102972 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.102985 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.105904 +INFO: TimeDuration, Event = Pool_end, Time = 0.002919 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352775.105923 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352775.105995 +INFO: TimeDuration, Event = Mul_end, Time = 0.000072 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.106009 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.106030 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352775.106043 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352775.106088 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000045 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 84.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.185160, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.126891 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.127355 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.127369 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.127812 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000443 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.135372 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.135838 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.135854 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.136292 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000438 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.136312 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.139848 +INFO: TimeDuration, Event = Pool_end, Time = 0.003536 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.151284 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.151541 +INFO: TimeDuration, Event = Add_end, Time = 0.000258 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.151558 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.151788 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.158740 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.158992 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.159005 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.159237 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.159249 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.162037 +INFO: TimeDuration, Event = Pool_end, Time = 0.002788 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.167408 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.167570 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.167583 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.167714 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000130 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.171596 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.171757 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.171770 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.171895 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.171907 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.174830 +INFO: TimeDuration, Event = Pool_end, Time = 0.002923 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352775.174850 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352775.174921 +INFO: TimeDuration, Event = Mul_end, Time = 0.000071 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.174935 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.174956 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352775.174970 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352775.175014 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.199997 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 49.998599, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.195736 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.196206 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.196221 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.196662 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000441 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.204216 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.204685 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.204703 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.205144 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000441 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.205166 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.208809 +INFO: TimeDuration, Event = Pool_end, Time = 0.003643 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.220130 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.220390 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.220539 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.220767 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.227567 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.227819 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.227832 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.228065 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000233 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.228076 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.230863 +INFO: TimeDuration, Event = Pool_end, Time = 0.002786 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.238792 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.238958 +INFO: TimeDuration, Event = Add_end, Time = 0.000165 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.238971 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.239099 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.242930 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.243093 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.243107 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.243237 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000130 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.243250 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.246163 +INFO: TimeDuration, Event = Pool_end, Time = 0.002914 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352775.246183 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352775.246255 +INFO: TimeDuration, Event = Mul_end, Time = 0.000072 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.246268 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.246289 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352775.246304 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352775.246348 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 84.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 52.345020, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.267054 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.267522 +INFO: TimeDuration, Event = Add_end, Time = 0.000467 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.267537 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.267981 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000444 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.275616 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.276084 +INFO: TimeDuration, Event = Add_end, Time = 0.000468 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.276100 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.276539 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000439 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.276608 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.280088 +INFO: TimeDuration, Event = Pool_end, Time = 0.003480 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.291525 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.291783 +INFO: TimeDuration, Event = Add_end, Time = 0.000257 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.291799 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.292028 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.298954 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.299206 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.299219 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.299449 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.299461 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.302252 +INFO: TimeDuration, Event = Pool_end, Time = 0.002791 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.307662 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.307823 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.307837 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.307964 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.311769 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.311929 +INFO: TimeDuration, Event = Add_end, Time = 0.000160 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.311942 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.312069 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.312081 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.315003 +INFO: TimeDuration, Event = Pool_end, Time = 0.002922 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352775.315022 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352775.315094 +INFO: TimeDuration, Event = Mul_end, Time = 0.000072 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.315108 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.315129 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352775.315143 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352775.315187 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000045 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 85.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 49.949986, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.335766 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.336236 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.336251 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.336694 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000443 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.344259 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.344726 +INFO: TimeDuration, Event = Add_end, Time = 0.000468 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.344742 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.345184 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000442 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.345198 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.348736 +INFO: TimeDuration, Event = Pool_end, Time = 0.003538 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.360176 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.360436 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.360558 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.360787 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.367613 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.367872 +INFO: TimeDuration, Event = Add_end, Time = 0.000258 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.367885 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.368115 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.368127 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.370908 +INFO: TimeDuration, Event = Pool_end, Time = 0.002781 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.376283 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.376446 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.376475 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.376604 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000129 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.380575 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.380737 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.380749 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.380876 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.380888 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.383802 +INFO: TimeDuration, Event = Pool_end, Time = 0.002914 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352775.383821 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352775.383893 +INFO: TimeDuration, Event = Mul_end, Time = 0.000071 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.383906 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.383927 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352775.383941 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352775.383986 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.077613, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.404507 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.404976 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.404994 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.405435 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000441 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.412975 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.413444 +INFO: TimeDuration, Event = Add_end, Time = 0.000468 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.413459 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.413899 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000440 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.413912 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.417490 +INFO: TimeDuration, Event = Pool_end, Time = 0.003578 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.428916 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.429177 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.429193 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.429425 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.436388 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.436641 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.436654 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.436886 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.436898 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.439681 +INFO: TimeDuration, Event = Pool_end, Time = 0.002783 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.445059 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.445223 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.445236 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.445363 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000128 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.449177 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.449338 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.449352 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.449477 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.449490 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.452416 +INFO: TimeDuration, Event = Pool_end, Time = 0.002926 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352775.452515 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352775.452588 +INFO: TimeDuration, Event = Mul_end, Time = 0.000072 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.452601 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.452623 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352775.452636 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352775.452681 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000045 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 49.978399, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.473695 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.474166 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.474181 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.474622 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000441 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.482126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.482590 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.482605 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.483043 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000438 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.483058 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.486603 +INFO: TimeDuration, Event = Pool_end, Time = 0.003544 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.498035 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.498294 +INFO: TimeDuration, Event = Add_end, Time = 0.000259 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.498311 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.498542 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.505496 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.505747 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.505760 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.505989 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.506001 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.508790 +INFO: TimeDuration, Event = Pool_end, Time = 0.002789 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.514208 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.514372 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.514385 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.514512 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.518333 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.518494 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.518507 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.518634 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.518646 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.521567 +INFO: TimeDuration, Event = Pool_end, Time = 0.002921 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352775.521587 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352775.521657 +INFO: TimeDuration, Event = Mul_end, Time = 0.000071 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.521671 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.521693 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352775.521707 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352775.521751 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.161965, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.544721 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.545192 +INFO: TimeDuration, Event = Add_end, Time = 0.000472 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.545209 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.545649 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000440 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.552969 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.553438 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.553455 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.553894 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000438 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.553905 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.557448 +INFO: TimeDuration, Event = Pool_end, Time = 0.003543 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.568879 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.569139 +INFO: TimeDuration, Event = Add_end, Time = 0.000261 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.569157 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.569389 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000233 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.576332 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.576582 +INFO: TimeDuration, Event = Add_end, Time = 0.000250 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.576594 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.576825 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.576837 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.579624 +INFO: TimeDuration, Event = Pool_end, Time = 0.002787 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.585006 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.585169 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.585181 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.585307 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.589168 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.589329 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.589342 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.589467 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.589479 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.592407 +INFO: TimeDuration, Event = Pool_end, Time = 0.002928 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352775.592479 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352775.592551 +INFO: TimeDuration, Event = Mul_end, Time = 0.000072 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.592565 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.592587 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352775.592639 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352775.592686 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000047 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 49.825245, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.613373 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.613850 +INFO: TimeDuration, Event = Add_end, Time = 0.000477 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.613868 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.614313 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000445 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.621809 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.622275 +INFO: TimeDuration, Event = Add_end, Time = 0.000465 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.622292 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.622730 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000437 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.622744 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.626289 +INFO: TimeDuration, Event = Pool_end, Time = 0.003545 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.637725 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.637986 +INFO: TimeDuration, Event = Add_end, Time = 0.000261 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.638002 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.638233 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.645186 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.645440 +INFO: TimeDuration, Event = Add_end, Time = 0.000254 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.645453 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.645686 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000233 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.645699 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.648500 +INFO: TimeDuration, Event = Pool_end, Time = 0.002801 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.653867 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.654029 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.654041 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.654169 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.657984 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.658145 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.658158 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.658285 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.658297 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.661218 +INFO: TimeDuration, Event = Pool_end, Time = 0.002921 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352775.661238 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352775.661309 +INFO: TimeDuration, Event = Mul_end, Time = 0.000071 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.661322 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.661344 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352775.661357 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352775.661402 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 85.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 49.952807, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.684251 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.684728 +INFO: TimeDuration, Event = Add_end, Time = 0.000477 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.684745 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.685189 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000444 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.692736 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.693206 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.693221 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.693666 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000445 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.693679 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.697217 +INFO: TimeDuration, Event = Pool_end, Time = 0.003537 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.708692 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.708952 +INFO: TimeDuration, Event = Add_end, Time = 0.000261 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.708969 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.709199 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.716158 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.716410 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.716488 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.716718 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.716731 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.719453 +INFO: TimeDuration, Event = Pool_end, Time = 0.002721 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.724874 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.725038 +INFO: TimeDuration, Event = Add_end, Time = 0.000164 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.725051 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.725178 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.728992 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.729154 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.729167 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.729294 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.729306 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.732226 +INFO: TimeDuration, Event = Pool_end, Time = 0.002920 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352775.732245 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352775.732318 +INFO: TimeDuration, Event = Mul_end, Time = 0.000072 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.732482 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.732507 +INFO: TimeDuration, Event = Add_end, Time = 0.000025 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352775.732522 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352775.732567 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000045 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.206724, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.753042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.753510 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.753527 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.753967 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000439 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.761512 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.761978 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.761993 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.762431 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000438 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.762446 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.765987 +INFO: TimeDuration, Event = Pool_end, Time = 0.003542 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.777420 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.777679 +INFO: TimeDuration, Event = Add_end, Time = 0.000259 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.777695 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.777925 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.784913 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.785164 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.785177 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.785406 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.785418 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.788207 +INFO: TimeDuration, Event = Pool_end, Time = 0.002789 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.793647 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.793810 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.793823 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.793949 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.797745 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.797906 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.797920 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.798046 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.798058 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.800978 +INFO: TimeDuration, Event = Pool_end, Time = 0.002920 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352775.800997 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352775.801068 +INFO: TimeDuration, Event = Mul_end, Time = 0.000070 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.801081 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.801103 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352775.801117 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352775.801161 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 84.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.012407, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.821771 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.822243 +INFO: TimeDuration, Event = Add_end, Time = 0.000472 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.822258 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.822697 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000439 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.830288 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.830755 +INFO: TimeDuration, Event = Add_end, Time = 0.000467 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.830770 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.831208 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000437 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.831219 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.834766 +INFO: TimeDuration, Event = Pool_end, Time = 0.003546 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.846198 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.846457 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.846474 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.846706 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.853652 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.853906 +INFO: TimeDuration, Event = Add_end, Time = 0.000254 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.853919 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.854149 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.854162 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.856949 +INFO: TimeDuration, Event = Pool_end, Time = 0.002788 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.862320 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.862483 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.862495 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.862623 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000128 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.866445 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.866606 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.866619 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.866745 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.866756 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.869678 +INFO: TimeDuration, Event = Pool_end, Time = 0.002922 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352775.869699 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352775.869769 +INFO: TimeDuration, Event = Mul_end, Time = 0.000070 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.869782 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.869804 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352775.869817 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352775.869862 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000046 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.199997 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.135051, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.890379 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.890849 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.890865 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.891303 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000438 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.898846 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.899312 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.899326 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.899763 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000437 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.899777 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.903326 +INFO: TimeDuration, Event = Pool_end, Time = 0.003548 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.914756 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.915021 +INFO: TimeDuration, Event = Add_end, Time = 0.000265 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.915037 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.915266 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.922196 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.922448 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.922461 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.922692 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.922704 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.925494 +INFO: TimeDuration, Event = Pool_end, Time = 0.002790 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.930868 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.931031 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.931044 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.931170 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.934974 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.935136 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.935150 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.935276 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.935287 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.938209 +INFO: TimeDuration, Event = Pool_end, Time = 0.002922 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352775.938229 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352775.938300 +INFO: TimeDuration, Event = Mul_end, Time = 0.000071 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.938314 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.938335 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352775.938349 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352775.938394 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 84.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.199082, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.959280 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.959752 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.959766 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.960205 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000439 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.967771 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.968238 +INFO: TimeDuration, Event = Add_end, Time = 0.000467 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.968256 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.968699 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000443 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.968717 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.972247 +INFO: TimeDuration, Event = Pool_end, Time = 0.003530 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.984258 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.984537 +INFO: TimeDuration, Event = Add_end, Time = 0.000278 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.984601 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.984844 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000243 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352775.992497 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352775.992754 +INFO: TimeDuration, Event = Add_end, Time = 0.000257 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352775.992771 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352775.993005 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000234 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352775.993020 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352775.995785 +INFO: TimeDuration, Event = Pool_end, Time = 0.002765 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.001308 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.001474 +INFO: TimeDuration, Event = Add_end, Time = 0.000167 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.001491 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.001620 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000129 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.005590 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.005758 +INFO: TimeDuration, Event = Add_end, Time = 0.000168 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.005774 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.005902 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000128 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.005917 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.008830 +INFO: TimeDuration, Event = Pool_end, Time = 0.002913 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352776.008854 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352776.008935 +INFO: TimeDuration, Event = Mul_end, Time = 0.000081 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.008952 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.008977 +INFO: TimeDuration, Event = Add_end, Time = 0.000025 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352776.008994 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352776.009044 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000050 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 85.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 51.753075, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.032121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.032596 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.032751 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.033192 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000441 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.041047 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.041519 +INFO: TimeDuration, Event = Add_end, Time = 0.000472 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.041541 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.041984 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000443 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.042001 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.045530 +INFO: TimeDuration, Event = Pool_end, Time = 0.003528 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.057117 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.057382 +INFO: TimeDuration, Event = Add_end, Time = 0.000265 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.057405 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.057637 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000233 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.064808 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.065064 +INFO: TimeDuration, Event = Add_end, Time = 0.000256 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.065081 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.065313 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.065328 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.068108 +INFO: TimeDuration, Event = Pool_end, Time = 0.002780 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.073628 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.073796 +INFO: TimeDuration, Event = Add_end, Time = 0.000168 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.073813 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.073942 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000129 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.077929 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.078095 +INFO: TimeDuration, Event = Add_end, Time = 0.000166 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.078112 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.078243 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000131 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.078258 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.081167 +INFO: TimeDuration, Event = Pool_end, Time = 0.002909 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352776.081192 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352776.081273 +INFO: TimeDuration, Event = Mul_end, Time = 0.000081 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.081291 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.081317 +INFO: TimeDuration, Event = Add_end, Time = 0.000026 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352776.081334 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352776.081386 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000052 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 51.353099, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.103192 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.103669 +INFO: TimeDuration, Event = Add_end, Time = 0.000478 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.103719 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.104160 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000441 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.112102 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.112573 +INFO: TimeDuration, Event = Add_end, Time = 0.000472 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.112591 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.113034 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000443 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.113051 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.116593 +INFO: TimeDuration, Event = Pool_end, Time = 0.003542 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.128003 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.128264 +INFO: TimeDuration, Event = Add_end, Time = 0.000261 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.128282 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.128515 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000234 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.135488 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.135740 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.135754 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.135984 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.135996 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.138783 +INFO: TimeDuration, Event = Pool_end, Time = 0.002787 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.144162 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.144324 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.144335 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.144461 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.148477 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.148638 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.148651 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.148776 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.148788 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.151616 +INFO: TimeDuration, Event = Pool_end, Time = 0.002828 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352776.151635 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352776.151707 +INFO: TimeDuration, Event = Mul_end, Time = 0.000072 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.151720 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.151783 +INFO: TimeDuration, Event = Add_end, Time = 0.000062 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352776.151797 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352776.151842 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.749790, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.172695 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.173167 +INFO: TimeDuration, Event = Add_end, Time = 0.000472 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.173185 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.173630 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000444 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.181153 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.181619 +INFO: TimeDuration, Event = Add_end, Time = 0.000467 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.181638 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.182072 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000434 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.182087 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.185632 +INFO: TimeDuration, Event = Pool_end, Time = 0.003545 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.197080 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.197340 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.197356 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.197587 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.204576 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.204827 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.204841 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.205069 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.205082 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.207850 +INFO: TimeDuration, Event = Pool_end, Time = 0.002768 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.213277 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.213441 +INFO: TimeDuration, Event = Add_end, Time = 0.000164 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.213454 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.213580 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.217455 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.217616 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.217629 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.217755 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.217767 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.220689 +INFO: TimeDuration, Event = Pool_end, Time = 0.002921 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352776.220709 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352776.220780 +INFO: TimeDuration, Event = Mul_end, Time = 0.000071 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.220793 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.220815 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352776.220828 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352776.220872 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.070002, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.242465 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.242941 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.242957 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.243397 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000440 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.251008 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.251470 +INFO: TimeDuration, Event = Add_end, Time = 0.000462 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.251486 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.251927 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000440 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.251940 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.255486 +INFO: TimeDuration, Event = Pool_end, Time = 0.003546 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.266923 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.267180 +INFO: TimeDuration, Event = Add_end, Time = 0.000257 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.267198 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.267426 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.274369 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.274620 +INFO: TimeDuration, Event = Add_end, Time = 0.000250 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.274633 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.274863 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.274873 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.277665 +INFO: TimeDuration, Event = Pool_end, Time = 0.002791 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.283038 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.283199 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.283211 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.283338 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.287166 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.287327 +INFO: TimeDuration, Event = Add_end, Time = 0.000160 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.287339 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.287464 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.287475 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.290400 +INFO: TimeDuration, Event = Pool_end, Time = 0.002925 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352776.290419 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352776.290489 +INFO: TimeDuration, Event = Mul_end, Time = 0.000069 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.290502 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.290524 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352776.290539 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352776.290582 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.012053, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.311108 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.311578 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.311594 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.312034 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000440 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.319709 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.320172 +INFO: TimeDuration, Event = Add_end, Time = 0.000463 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.320189 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.320626 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000438 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.320640 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.324189 +INFO: TimeDuration, Event = Pool_end, Time = 0.003549 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.335631 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.335889 +INFO: TimeDuration, Event = Add_end, Time = 0.000259 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.335907 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.336137 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.343114 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.343366 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.343379 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.343610 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.343622 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.346398 +INFO: TimeDuration, Event = Pool_end, Time = 0.002776 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.351778 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.351940 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.351952 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.352081 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000129 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.355905 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.356066 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.356080 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.356207 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.356218 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.359143 +INFO: TimeDuration, Event = Pool_end, Time = 0.002924 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352776.359163 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352776.359234 +INFO: TimeDuration, Event = Mul_end, Time = 0.000070 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.359248 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.359271 +INFO: TimeDuration, Event = Add_end, Time = 0.000023 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352776.359285 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352776.359335 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000051 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 85.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.067984, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.379843 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.380331 +INFO: TimeDuration, Event = Add_end, Time = 0.000488 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.380713 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.381154 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000441 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.388406 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.388870 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.388887 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.389326 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000439 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.389341 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.392886 +INFO: TimeDuration, Event = Pool_end, Time = 0.003545 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.404349 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.404609 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.404628 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.404860 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.411812 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.412063 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.412076 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.412326 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000249 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.412336 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.415107 +INFO: TimeDuration, Event = Pool_end, Time = 0.002771 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.420496 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.420659 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.420672 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.420799 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.424650 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.424812 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.424826 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.424951 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.424963 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.427882 +INFO: TimeDuration, Event = Pool_end, Time = 0.002918 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352776.427900 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352776.427971 +INFO: TimeDuration, Event = Mul_end, Time = 0.000071 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.427984 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.428006 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352776.428019 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352776.428063 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 49.761158, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.448698 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.449164 +INFO: TimeDuration, Event = Add_end, Time = 0.000465 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.449179 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.449614 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000435 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.457098 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.457563 +INFO: TimeDuration, Event = Add_end, Time = 0.000465 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.457594 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.458033 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000439 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.458045 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.461574 +INFO: TimeDuration, Event = Pool_end, Time = 0.003528 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.473047 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.473307 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.473325 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.473556 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.480547 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.480800 +INFO: TimeDuration, Event = Add_end, Time = 0.000253 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.480814 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.481042 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.481055 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.483841 +INFO: TimeDuration, Event = Pool_end, Time = 0.002787 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.489241 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.489403 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.489416 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.489543 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.493441 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.493602 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.493615 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.493743 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000128 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.493755 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.496675 +INFO: TimeDuration, Event = Pool_end, Time = 0.002919 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352776.496694 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352776.496766 +INFO: TimeDuration, Event = Mul_end, Time = 0.000072 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.496780 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.496802 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352776.496815 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352776.496858 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 84.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.057363, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.517194 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.517664 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.517681 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.518121 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000441 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.525702 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.526164 +INFO: TimeDuration, Event = Add_end, Time = 0.000462 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.526180 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.526651 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000470 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.526666 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.530177 +INFO: TimeDuration, Event = Pool_end, Time = 0.003511 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.541621 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.541880 +INFO: TimeDuration, Event = Add_end, Time = 0.000259 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.541897 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.542128 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.549108 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.549360 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.549373 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.549603 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.549615 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.552407 +INFO: TimeDuration, Event = Pool_end, Time = 0.002792 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.557798 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.557962 +INFO: TimeDuration, Event = Add_end, Time = 0.000164 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.557975 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.558101 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.561920 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.562081 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.562094 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.562221 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.562233 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.565155 +INFO: TimeDuration, Event = Pool_end, Time = 0.002921 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352776.565176 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352776.565248 +INFO: TimeDuration, Event = Mul_end, Time = 0.000071 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.565262 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.565284 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352776.565298 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352776.565343 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000045 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.199997 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.069347, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.585991 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.586464 +INFO: TimeDuration, Event = Add_end, Time = 0.000473 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.586481 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.586921 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000440 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.594497 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.594961 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.594978 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.595416 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000438 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.595430 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.598968 +INFO: TimeDuration, Event = Pool_end, Time = 0.003537 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.610414 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.610672 +INFO: TimeDuration, Event = Add_end, Time = 0.000259 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.610690 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.610924 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000234 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.617876 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.618127 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.618140 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.618370 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.618382 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.621174 +INFO: TimeDuration, Event = Pool_end, Time = 0.002792 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.626556 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.626719 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.626734 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.626861 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.630701 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.630863 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.630876 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.631001 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.631012 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.633934 +INFO: TimeDuration, Event = Pool_end, Time = 0.002923 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352776.633954 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352776.634026 +INFO: TimeDuration, Event = Mul_end, Time = 0.000072 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.634040 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.634061 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352776.634075 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352776.634120 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 84.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.266615, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.654721 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.655195 +INFO: TimeDuration, Event = Add_end, Time = 0.000474 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.655212 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.655649 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000438 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.663259 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.663725 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.663741 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.664180 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000439 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.664194 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.667734 +INFO: TimeDuration, Event = Pool_end, Time = 0.003540 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.679179 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.679439 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.679456 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.679686 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.686667 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.686919 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.686933 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.687163 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.687176 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.689961 +INFO: TimeDuration, Event = Pool_end, Time = 0.002785 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.695418 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.695593 +INFO: TimeDuration, Event = Add_end, Time = 0.000176 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.695607 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.695732 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.699616 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.699776 +INFO: TimeDuration, Event = Add_end, Time = 0.000160 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.699789 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.699915 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.699927 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.702849 +INFO: TimeDuration, Event = Pool_end, Time = 0.002922 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352776.702869 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352776.702976 +INFO: TimeDuration, Event = Mul_end, Time = 0.000107 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.702991 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.703012 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352776.703027 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352776.703072 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 85.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.263285, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.723593 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.724064 +INFO: TimeDuration, Event = Add_end, Time = 0.000472 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.724080 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.724521 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000441 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.732118 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.732584 +INFO: TimeDuration, Event = Add_end, Time = 0.000465 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.732601 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.733038 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000437 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.733051 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.736617 +INFO: TimeDuration, Event = Pool_end, Time = 0.003566 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.748037 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.748297 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.748534 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.748761 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.755504 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.755755 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.755769 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.755998 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.756010 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.758797 +INFO: TimeDuration, Event = Pool_end, Time = 0.002788 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.764217 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.764381 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.764480 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.764604 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000124 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.768520 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.768682 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.768696 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.768821 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.768835 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.771747 +INFO: TimeDuration, Event = Pool_end, Time = 0.002913 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352776.771767 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352776.771839 +INFO: TimeDuration, Event = Mul_end, Time = 0.000072 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.771852 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.771875 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352776.771889 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352776.771934 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000045 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 49.987846, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.792711 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.793181 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.793197 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.793638 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000441 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.801071 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.801534 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.801553 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.801989 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.802002 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.805547 +INFO: TimeDuration, Event = Pool_end, Time = 0.003545 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.816997 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.817255 +INFO: TimeDuration, Event = Add_end, Time = 0.000258 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.817273 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.817502 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.824469 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.824721 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.824735 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.824965 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.824977 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.827763 +INFO: TimeDuration, Event = Pool_end, Time = 0.002786 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.833153 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.833316 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.833329 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.833457 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000128 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.837296 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.837457 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.837470 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.837597 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.837610 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.840531 +INFO: TimeDuration, Event = Pool_end, Time = 0.002921 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352776.840551 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352776.840623 +INFO: TimeDuration, Event = Mul_end, Time = 0.000072 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.840636 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.840659 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352776.840672 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352776.840716 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.088471, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.861619 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.862085 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.862102 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.862545 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000443 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.870130 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.870592 +INFO: TimeDuration, Event = Add_end, Time = 0.000462 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.870607 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.871046 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000439 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.871060 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.874608 +INFO: TimeDuration, Event = Pool_end, Time = 0.003548 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.886054 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.886313 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.886355 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.886585 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.893538 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.893790 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.893804 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.894033 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.894046 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.896868 +INFO: TimeDuration, Event = Pool_end, Time = 0.002822 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.902243 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.902405 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.902418 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.902548 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000130 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.906400 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.906562 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.906575 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.906699 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000124 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.906711 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.909632 +INFO: TimeDuration, Event = Pool_end, Time = 0.002921 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352776.909651 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352776.909722 +INFO: TimeDuration, Event = Mul_end, Time = 0.000071 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.909736 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.909758 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352776.909772 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352776.909816 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.240827, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.933618 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.934099 +INFO: TimeDuration, Event = Add_end, Time = 0.000481 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.934121 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.934567 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000446 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.942528 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.942998 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.943020 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.943459 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000439 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.943479 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.947008 +INFO: TimeDuration, Event = Pool_end, Time = 0.003529 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.958592 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.958856 +INFO: TimeDuration, Event = Add_end, Time = 0.000264 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.958878 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.959114 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000235 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.966311 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.966566 +INFO: TimeDuration, Event = Add_end, Time = 0.000256 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.966585 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.966819 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000234 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.966836 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.969602 +INFO: TimeDuration, Event = Pool_end, Time = 0.002766 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.975126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.975295 +INFO: TimeDuration, Event = Add_end, Time = 0.000169 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.975313 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.975445 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000132 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.979433 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.979602 +INFO: TimeDuration, Event = Add_end, Time = 0.000169 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352776.979620 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352776.979751 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000131 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352776.979769 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352776.982671 +INFO: TimeDuration, Event = Pool_end, Time = 0.002901 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352776.982698 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352776.982781 +INFO: TimeDuration, Event = Mul_end, Time = 0.000082 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352776.982801 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352776.982828 +INFO: TimeDuration, Event = Add_end, Time = 0.000027 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352776.982847 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352776.982901 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000053 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 51.351734, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.005540 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.006027 +INFO: TimeDuration, Event = Add_end, Time = 0.000486 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.006052 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.006500 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000448 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.014643 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.015110 +INFO: TimeDuration, Event = Add_end, Time = 0.000467 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.015128 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.015566 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000438 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.015580 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.019121 +INFO: TimeDuration, Event = Pool_end, Time = 0.003542 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.030609 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.030870 +INFO: TimeDuration, Event = Add_end, Time = 0.000261 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.030889 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.031118 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.038136 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.038389 +INFO: TimeDuration, Event = Add_end, Time = 0.000253 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.038403 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.038634 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.038647 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.041431 +INFO: TimeDuration, Event = Pool_end, Time = 0.002784 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.046923 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.047093 +INFO: TimeDuration, Event = Add_end, Time = 0.000170 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.047107 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.047234 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.051101 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.051262 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.051276 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.051403 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.051416 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.054333 +INFO: TimeDuration, Event = Pool_end, Time = 0.002917 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352777.054355 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352777.054428 +INFO: TimeDuration, Event = Mul_end, Time = 0.000073 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.054443 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.054465 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352777.054479 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352777.054523 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 85.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 51.657028, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.075693 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.076164 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.076180 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.076624 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000444 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.084293 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.084805 +INFO: TimeDuration, Event = Add_end, Time = 0.000512 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.084824 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.085262 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000437 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.085276 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.088774 +INFO: TimeDuration, Event = Pool_end, Time = 0.003498 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.100252 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.100570 +INFO: TimeDuration, Event = Add_end, Time = 0.000317 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.100591 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.100820 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.107786 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.108038 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.108052 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.108283 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.108296 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.111082 +INFO: TimeDuration, Event = Pool_end, Time = 0.002785 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.116494 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.116657 +INFO: TimeDuration, Event = Add_end, Time = 0.000164 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.116671 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.116797 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.120653 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.120815 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.120829 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.120956 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.120968 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.123887 +INFO: TimeDuration, Event = Pool_end, Time = 0.002918 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352777.123907 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352777.123980 +INFO: TimeDuration, Event = Mul_end, Time = 0.000073 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.123994 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.124016 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352777.124030 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352777.124076 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000045 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.324629, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.144907 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.145379 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.145396 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.145841 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000446 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.153615 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.154098 +INFO: TimeDuration, Event = Add_end, Time = 0.000483 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.154114 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.154552 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000438 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.154568 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.158116 +INFO: TimeDuration, Event = Pool_end, Time = 0.003548 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.169597 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.169860 +INFO: TimeDuration, Event = Add_end, Time = 0.000263 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.169878 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.170109 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.177121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.177378 +INFO: TimeDuration, Event = Add_end, Time = 0.000257 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.177392 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.177626 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000234 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.177640 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.180417 +INFO: TimeDuration, Event = Pool_end, Time = 0.002776 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.185963 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.186132 +INFO: TimeDuration, Event = Add_end, Time = 0.000169 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.186147 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.186275 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000128 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.190121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.190283 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.190297 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.190423 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.190436 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.193358 +INFO: TimeDuration, Event = Pool_end, Time = 0.002922 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352777.193379 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352777.193458 +INFO: TimeDuration, Event = Mul_end, Time = 0.000079 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.193473 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.193497 +INFO: TimeDuration, Event = Add_end, Time = 0.000025 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352777.193512 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352777.193581 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000069 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 84.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.104661, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.214620 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.215094 +INFO: TimeDuration, Event = Add_end, Time = 0.000474 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.215111 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.215554 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000443 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.223205 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.223671 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.223687 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.224123 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.224137 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.227681 +INFO: TimeDuration, Event = Pool_end, Time = 0.003544 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.242328 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.242599 +INFO: TimeDuration, Event = Add_end, Time = 0.000271 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.242619 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.242850 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.249097 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.249351 +INFO: TimeDuration, Event = Add_end, Time = 0.000254 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.249366 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.249595 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.249608 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.252394 +INFO: TimeDuration, Event = Pool_end, Time = 0.002786 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.258035 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.258201 +INFO: TimeDuration, Event = Add_end, Time = 0.000166 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.258216 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.258345 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000129 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.262212 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.262374 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.262388 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.262515 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.262527 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.265447 +INFO: TimeDuration, Event = Pool_end, Time = 0.002919 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352777.265468 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352777.265610 +INFO: TimeDuration, Event = Mul_end, Time = 0.000142 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.265625 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.265647 +INFO: TimeDuration, Event = Add_end, Time = 0.000023 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352777.265662 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352777.265708 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000046 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.199997 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 52.220574, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.286710 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.287182 +INFO: TimeDuration, Event = Add_end, Time = 0.000472 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.287209 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.287650 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000440 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.295356 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.295822 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.295838 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.296276 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000438 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.296288 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.299818 +INFO: TimeDuration, Event = Pool_end, Time = 0.003529 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.311288 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.311547 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.311565 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.311802 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000237 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.318795 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.319046 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.319060 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.319288 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.319320 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.322090 +INFO: TimeDuration, Event = Pool_end, Time = 0.002770 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.327506 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.327668 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.327682 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.327807 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.331636 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.331796 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.331809 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.331934 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.331946 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.334869 +INFO: TimeDuration, Event = Pool_end, Time = 0.002922 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352777.334889 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352777.334962 +INFO: TimeDuration, Event = Mul_end, Time = 0.000074 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.334976 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.334999 +INFO: TimeDuration, Event = Add_end, Time = 0.000023 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352777.335012 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352777.335057 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000045 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 84.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.294206, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.356098 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.356573 +INFO: TimeDuration, Event = Add_end, Time = 0.000474 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.356695 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.357134 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000439 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.364685 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.365151 +INFO: TimeDuration, Event = Add_end, Time = 0.000465 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.365167 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.365606 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000439 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.365621 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.369163 +INFO: TimeDuration, Event = Pool_end, Time = 0.003542 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.380630 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.380892 +INFO: TimeDuration, Event = Add_end, Time = 0.000262 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.380911 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.381141 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.388149 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.388402 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.388412 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.388642 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.388655 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.391443 +INFO: TimeDuration, Event = Pool_end, Time = 0.002788 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.396842 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.397006 +INFO: TimeDuration, Event = Add_end, Time = 0.000164 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.397020 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.397146 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.400994 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.401156 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.401169 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.401295 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.401310 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.404229 +INFO: TimeDuration, Event = Pool_end, Time = 0.002919 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352777.404248 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352777.404321 +INFO: TimeDuration, Event = Mul_end, Time = 0.000072 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.404482 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.404507 +INFO: TimeDuration, Event = Add_end, Time = 0.000025 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352777.404522 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352777.404567 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000046 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 85.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.255913, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.425065 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.425535 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.425552 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.425990 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000438 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.433650 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.434114 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.434131 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.434564 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000433 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.434577 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.438130 +INFO: TimeDuration, Event = Pool_end, Time = 0.003553 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.451650 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.451919 +INFO: TimeDuration, Event = Add_end, Time = 0.000270 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.451938 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.452175 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000237 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.459295 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.459547 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.459562 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.459793 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.459805 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.462596 +INFO: TimeDuration, Event = Pool_end, Time = 0.002791 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.468014 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.468176 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.468189 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.468328 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000139 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.472160 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.472562 +INFO: TimeDuration, Event = Add_end, Time = 0.000402 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.472580 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.472704 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000124 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.472718 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.475395 +INFO: TimeDuration, Event = Pool_end, Time = 0.002677 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352777.475415 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352777.475487 +INFO: TimeDuration, Event = Mul_end, Time = 0.000072 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.475500 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.475528 +INFO: TimeDuration, Event = Add_end, Time = 0.000027 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352777.475542 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352777.475594 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000052 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 52.441887, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.496188 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.496664 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.496679 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.497119 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000441 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.504749 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.505215 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.505232 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.505668 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.505683 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.509226 +INFO: TimeDuration, Event = Pool_end, Time = 0.003543 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.520690 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.520950 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.520968 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.521198 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.528185 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.528436 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.528479 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.528709 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.528721 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.531480 +INFO: TimeDuration, Event = Pool_end, Time = 0.002758 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.536881 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.537044 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.537058 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.537185 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.541045 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.541207 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.541220 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.541345 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.541358 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.544281 +INFO: TimeDuration, Event = Pool_end, Time = 0.002922 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352777.544301 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352777.544381 +INFO: TimeDuration, Event = Mul_end, Time = 0.000080 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.544478 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.544502 +INFO: TimeDuration, Event = Add_end, Time = 0.000025 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352777.544517 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352777.544562 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000045 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.185359, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.565737 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.566210 +INFO: TimeDuration, Event = Add_end, Time = 0.000472 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.566227 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.566666 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000439 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.574376 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.574839 +INFO: TimeDuration, Event = Add_end, Time = 0.000463 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.574855 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.575293 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000437 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.575307 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.578856 +INFO: TimeDuration, Event = Pool_end, Time = 0.003550 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.590317 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.590578 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.590595 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.590826 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.597815 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.598066 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.598080 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.598310 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.598323 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.601110 +INFO: TimeDuration, Event = Pool_end, Time = 0.002787 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.606552 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.606715 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.606729 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.606856 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.610690 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.610852 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.610866 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.610991 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.611004 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.613940 +INFO: TimeDuration, Event = Pool_end, Time = 0.002936 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352777.613960 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352777.614032 +INFO: TimeDuration, Event = Mul_end, Time = 0.000072 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.614046 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.614068 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352777.614082 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352777.614127 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000045 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.393541, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.635432 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.635901 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.635918 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.636357 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000439 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.644232 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.644700 +INFO: TimeDuration, Event = Add_end, Time = 0.000468 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.644718 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.645156 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000438 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.645169 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.648717 +INFO: TimeDuration, Event = Pool_end, Time = 0.003548 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.660172 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.660432 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.660536 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.660765 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.667958 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.668209 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.668222 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.668457 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000235 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.668494 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.671257 +INFO: TimeDuration, Event = Pool_end, Time = 0.002763 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.676653 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.676817 +INFO: TimeDuration, Event = Add_end, Time = 0.000164 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.676831 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.676959 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000128 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.680821 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.680983 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.680996 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.681123 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.681136 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.684055 +INFO: TimeDuration, Event = Pool_end, Time = 0.002919 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352777.684074 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352777.684146 +INFO: TimeDuration, Event = Mul_end, Time = 0.000072 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.684160 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.684183 +INFO: TimeDuration, Event = Add_end, Time = 0.000023 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352777.684197 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352777.684241 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000045 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.632287, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.705081 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.705549 +INFO: TimeDuration, Event = Add_end, Time = 0.000468 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.705564 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.706004 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000440 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.713670 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.714136 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.714152 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.714588 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.714603 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.718133 +INFO: TimeDuration, Event = Pool_end, Time = 0.003530 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.729599 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.729858 +INFO: TimeDuration, Event = Add_end, Time = 0.000259 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.729876 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.730106 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.737092 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.737344 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.737357 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.737586 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.737598 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.740389 +INFO: TimeDuration, Event = Pool_end, Time = 0.002791 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.746156 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.746336 +INFO: TimeDuration, Event = Add_end, Time = 0.000180 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.746351 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.746478 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.750321 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.750481 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.750495 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.750621 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.750633 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.753556 +INFO: TimeDuration, Event = Pool_end, Time = 0.002923 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352777.753576 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352777.753649 +INFO: TimeDuration, Event = Mul_end, Time = 0.000073 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.753663 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.753685 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352777.753699 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352777.753743 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 85.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.415207, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.774858 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.775327 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.775343 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.775784 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000441 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.783429 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.783892 +INFO: TimeDuration, Event = Add_end, Time = 0.000463 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.783909 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.784346 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000437 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.784357 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.787908 +INFO: TimeDuration, Event = Pool_end, Time = 0.003551 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.799361 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.799620 +INFO: TimeDuration, Event = Add_end, Time = 0.000259 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.799637 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.799867 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.806836 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.807087 +INFO: TimeDuration, Event = Add_end, Time = 0.000250 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.807100 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.807331 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.807343 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.810132 +INFO: TimeDuration, Event = Pool_end, Time = 0.002789 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.815570 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.815734 +INFO: TimeDuration, Event = Add_end, Time = 0.000164 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.815747 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.815874 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.819731 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.819895 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.819908 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.820034 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.820047 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.822962 +INFO: TimeDuration, Event = Pool_end, Time = 0.002915 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352777.823020 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352777.823092 +INFO: TimeDuration, Event = Mul_end, Time = 0.000072 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.823107 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.823128 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352777.823142 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352777.823187 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000045 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.243111, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.844016 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.844485 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.844499 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.844943 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000445 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.852544 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.853006 +INFO: TimeDuration, Event = Add_end, Time = 0.000462 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.853023 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.853464 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000441 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.853478 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.857023 +INFO: TimeDuration, Event = Pool_end, Time = 0.003544 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.868490 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.868752 +INFO: TimeDuration, Event = Add_end, Time = 0.000261 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.868769 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.868998 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.875999 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.876255 +INFO: TimeDuration, Event = Add_end, Time = 0.000255 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.876267 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.876497 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.876510 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.879292 +INFO: TimeDuration, Event = Pool_end, Time = 0.002782 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.884699 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.884861 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.884875 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.885000 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.888868 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.889029 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.889043 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.889169 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.889180 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.892102 +INFO: TimeDuration, Event = Pool_end, Time = 0.002922 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352777.892121 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352777.892194 +INFO: TimeDuration, Event = Mul_end, Time = 0.000073 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.892208 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.892230 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352777.892245 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352777.892289 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 84.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.193689, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.912871 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.913341 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.913356 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.913800 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000443 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.921421 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.921888 +INFO: TimeDuration, Event = Add_end, Time = 0.000467 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.921904 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.922340 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.922354 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.925896 +INFO: TimeDuration, Event = Pool_end, Time = 0.003543 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.937361 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.937620 +INFO: TimeDuration, Event = Add_end, Time = 0.000259 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.937637 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.937868 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.945004 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.945258 +INFO: TimeDuration, Event = Add_end, Time = 0.000254 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.945272 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.945503 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.945516 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.948300 +INFO: TimeDuration, Event = Pool_end, Time = 0.002784 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.953960 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.954125 +INFO: TimeDuration, Event = Add_end, Time = 0.000166 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.954139 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.954267 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000128 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.958221 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.958384 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.958397 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.958523 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.958535 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.961457 +INFO: TimeDuration, Event = Pool_end, Time = 0.002922 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352777.961477 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352777.961550 +INFO: TimeDuration, Event = Mul_end, Time = 0.000073 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.961565 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.961587 +INFO: TimeDuration, Event = Add_end, Time = 0.000023 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352777.961602 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352777.961646 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.199997 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.704457, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.982577 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.983058 +INFO: TimeDuration, Event = Add_end, Time = 0.000481 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.983074 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.983513 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000439 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352777.991128 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352777.991591 +INFO: TimeDuration, Event = Add_end, Time = 0.000463 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352777.991606 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352777.992046 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000440 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352777.992060 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352777.995607 +INFO: TimeDuration, Event = Pool_end, Time = 0.003546 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.007071 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.007329 +INFO: TimeDuration, Event = Add_end, Time = 0.000258 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.007348 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.007579 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.014564 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.014816 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.014829 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.015096 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000267 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.015110 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.017863 +INFO: TimeDuration, Event = Pool_end, Time = 0.002753 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.023253 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.023416 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.023431 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.023557 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.027394 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.027555 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.027571 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.027697 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.027710 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.030626 +INFO: TimeDuration, Event = Pool_end, Time = 0.002916 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352778.030646 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352778.030719 +INFO: TimeDuration, Event = Mul_end, Time = 0.000072 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.030732 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.030754 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352778.030768 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352778.030813 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 84.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.462858, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.052028 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.052499 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.052513 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.052955 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000442 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.060576 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.061043 +INFO: TimeDuration, Event = Add_end, Time = 0.000467 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.061059 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.061495 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.061509 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.065055 +INFO: TimeDuration, Event = Pool_end, Time = 0.003547 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.076535 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.076795 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.076812 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.077045 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000233 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.084039 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.084291 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.084311 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.084543 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.084556 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.087337 +INFO: TimeDuration, Event = Pool_end, Time = 0.002782 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.092735 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.092898 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.092912 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.093038 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.096890 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.097053 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.097066 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.097193 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.097205 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.100139 +INFO: TimeDuration, Event = Pool_end, Time = 0.002934 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352778.100158 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352778.100231 +INFO: TimeDuration, Event = Mul_end, Time = 0.000072 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.100245 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.100267 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352778.100281 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352778.100332 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000051 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 85.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.229369, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.121039 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.121507 +INFO: TimeDuration, Event = Add_end, Time = 0.000468 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.121525 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.121968 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000443 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.129550 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.130013 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.130030 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.130468 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000438 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.130483 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.134019 +INFO: TimeDuration, Event = Pool_end, Time = 0.003536 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.146431 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.146697 +INFO: TimeDuration, Event = Add_end, Time = 0.000266 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.146716 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.146947 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.153893 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.154144 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.154157 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.154386 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.154398 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.157211 +INFO: TimeDuration, Event = Pool_end, Time = 0.002813 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.162589 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.162751 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.162764 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.162889 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.166710 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.166870 +INFO: TimeDuration, Event = Add_end, Time = 0.000160 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.166883 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.167010 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.167021 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.169943 +INFO: TimeDuration, Event = Pool_end, Time = 0.002923 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352778.169963 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352778.170039 +INFO: TimeDuration, Event = Mul_end, Time = 0.000076 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.170053 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.170077 +INFO: TimeDuration, Event = Add_end, Time = 0.000024 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352778.170091 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352778.170142 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000051 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 51.013250, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.193684 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.194160 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.194175 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.194618 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000443 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.202324 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.202789 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.202805 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.203240 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000435 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.203252 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.206804 +INFO: TimeDuration, Event = Pool_end, Time = 0.003551 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.218249 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.218508 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.218527 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.218757 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.225715 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.225966 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.225979 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.226212 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.226224 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.229027 +INFO: TimeDuration, Event = Pool_end, Time = 0.002803 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.234433 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.234596 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.234608 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.234733 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.238580 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.238741 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.238754 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.238881 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.238893 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.241814 +INFO: TimeDuration, Event = Pool_end, Time = 0.002921 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352778.241834 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352778.241907 +INFO: TimeDuration, Event = Mul_end, Time = 0.000072 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.241921 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.241945 +INFO: TimeDuration, Event = Add_end, Time = 0.000023 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352778.241959 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352778.242004 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000045 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.223112, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.263067 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.263536 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.263549 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.263990 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000440 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.271537 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.272001 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.272016 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.272455 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000439 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.272613 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.276016 +INFO: TimeDuration, Event = Pool_end, Time = 0.003403 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.287444 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.287702 +INFO: TimeDuration, Event = Add_end, Time = 0.000258 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.287719 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.287949 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.294870 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.295121 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.295133 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.295362 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.295374 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.298166 +INFO: TimeDuration, Event = Pool_end, Time = 0.002793 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.303542 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.303708 +INFO: TimeDuration, Event = Add_end, Time = 0.000165 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.303720 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.303846 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.307639 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.307799 +INFO: TimeDuration, Event = Add_end, Time = 0.000160 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.307811 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.307937 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.307948 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.310872 +INFO: TimeDuration, Event = Pool_end, Time = 0.002924 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352778.310890 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352778.310961 +INFO: TimeDuration, Event = Mul_end, Time = 0.000071 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.310975 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.310997 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352778.311010 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352778.311053 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 49.722524, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.332037 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.332510 +INFO: TimeDuration, Event = Add_end, Time = 0.000473 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.332703 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.333145 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000442 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.340522 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.340989 +INFO: TimeDuration, Event = Add_end, Time = 0.000467 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.341006 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.341440 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000434 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.341454 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.344997 +INFO: TimeDuration, Event = Pool_end, Time = 0.003543 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.356428 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.356688 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.356705 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.356933 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.363894 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.364146 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.364158 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.364389 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.364481 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.367194 +INFO: TimeDuration, Event = Pool_end, Time = 0.002712 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.372609 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.372772 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.372785 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.372912 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.376716 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.376877 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.376891 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.377015 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000124 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.377026 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.379950 +INFO: TimeDuration, Event = Pool_end, Time = 0.002924 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352778.380008 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352778.380080 +INFO: TimeDuration, Event = Mul_end, Time = 0.000072 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.380093 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.380114 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352778.380128 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352778.380172 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000045 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 49.763666, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.400886 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.401354 +INFO: TimeDuration, Event = Add_end, Time = 0.000468 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.401370 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.401808 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000438 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.409354 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.409823 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.409839 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.410276 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000437 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.410288 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.413834 +INFO: TimeDuration, Event = Pool_end, Time = 0.003546 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.425263 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.425521 +INFO: TimeDuration, Event = Add_end, Time = 0.000259 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.425538 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.425770 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.432701 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.432952 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.432966 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.433198 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.433210 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.435992 +INFO: TimeDuration, Event = Pool_end, Time = 0.002782 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.441367 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.441530 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.441542 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.441667 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.445462 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.445622 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.445635 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.445760 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000124 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.445773 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.448698 +INFO: TimeDuration, Event = Pool_end, Time = 0.002925 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352778.448718 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352778.448788 +INFO: TimeDuration, Event = Mul_end, Time = 0.000070 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.448802 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.448823 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352778.448836 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352778.448879 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 85.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 49.879827, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.469739 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.470210 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.470226 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.470672 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000446 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.478187 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.478650 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.478664 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.479102 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000437 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.479116 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.482664 +INFO: TimeDuration, Event = Pool_end, Time = 0.003548 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.494094 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.494353 +INFO: TimeDuration, Event = Add_end, Time = 0.000259 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.494370 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.494601 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.501536 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.501788 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.501802 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.502032 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.502042 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.504830 +INFO: TimeDuration, Event = Pool_end, Time = 0.002788 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.510219 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.510381 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.510394 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.510520 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.514340 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.514502 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.514513 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.514639 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.514650 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.517572 +INFO: TimeDuration, Event = Pool_end, Time = 0.002922 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352778.517592 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352778.517662 +INFO: TimeDuration, Event = Mul_end, Time = 0.000070 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.517676 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.517697 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352778.517710 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352778.517754 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.194578, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.539156 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.539628 +INFO: TimeDuration, Event = Add_end, Time = 0.000472 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.539643 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.540085 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000442 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.547862 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.548329 +INFO: TimeDuration, Event = Add_end, Time = 0.000467 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.548535 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.548984 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000449 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.548999 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.552634 +INFO: TimeDuration, Event = Pool_end, Time = 0.003635 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.563886 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.564149 +INFO: TimeDuration, Event = Add_end, Time = 0.000263 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.564166 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.564400 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000234 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.571597 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.571933 +INFO: TimeDuration, Event = Add_end, Time = 0.000336 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.571946 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.572176 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.572190 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.574889 +INFO: TimeDuration, Event = Pool_end, Time = 0.002699 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.580333 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.580503 +INFO: TimeDuration, Event = Add_end, Time = 0.000170 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.580516 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.580644 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.584754 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.584918 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.584932 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.585057 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.585071 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.587983 +INFO: TimeDuration, Event = Pool_end, Time = 0.002913 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352778.588002 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352778.588084 +INFO: TimeDuration, Event = Mul_end, Time = 0.000081 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.588111 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.588133 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352778.588147 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352778.588191 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 84.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.989680, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.609484 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.609958 +INFO: TimeDuration, Event = Add_end, Time = 0.000474 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.609974 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.610413 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000439 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.617864 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.618331 +INFO: TimeDuration, Event = Add_end, Time = 0.000467 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.618347 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.618810 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000463 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.618825 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.622340 +INFO: TimeDuration, Event = Pool_end, Time = 0.003515 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.633804 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.634064 +INFO: TimeDuration, Event = Add_end, Time = 0.000259 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.634081 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.634310 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.641249 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.641501 +INFO: TimeDuration, Event = Add_end, Time = 0.000253 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.641514 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.641743 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.641754 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.644543 +INFO: TimeDuration, Event = Pool_end, Time = 0.002789 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.649924 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.650086 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.650099 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.650223 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000124 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.654020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.654183 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.654196 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.654321 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.654333 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.657256 +INFO: TimeDuration, Event = Pool_end, Time = 0.002923 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352778.657275 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352778.657347 +INFO: TimeDuration, Event = Mul_end, Time = 0.000072 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.657361 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.657383 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352778.657397 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352778.657441 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000045 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.199997 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.172893, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.678240 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.678714 +INFO: TimeDuration, Event = Add_end, Time = 0.000473 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.678731 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.679169 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000438 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.686683 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.687144 +INFO: TimeDuration, Event = Add_end, Time = 0.000461 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.687159 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.687601 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000442 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.687614 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.691164 +INFO: TimeDuration, Event = Pool_end, Time = 0.003550 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.702596 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.702856 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.702872 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.703100 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.710032 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.710282 +INFO: TimeDuration, Event = Add_end, Time = 0.000250 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.710294 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.710523 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.710535 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.713331 +INFO: TimeDuration, Event = Pool_end, Time = 0.002796 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.718703 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.718866 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.718878 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.719009 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000131 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.722812 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.722972 +INFO: TimeDuration, Event = Add_end, Time = 0.000160 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.722985 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.723111 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.723123 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.726045 +INFO: TimeDuration, Event = Pool_end, Time = 0.002923 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352778.726068 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352778.726140 +INFO: TimeDuration, Event = Mul_end, Time = 0.000072 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.726153 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.726175 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352778.726189 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352778.726233 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 84.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.288775, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.746931 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.747398 +INFO: TimeDuration, Event = Add_end, Time = 0.000468 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.747413 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.747854 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000441 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.755406 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.755869 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.755884 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.756321 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000437 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.756660 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.759881 +INFO: TimeDuration, Event = Pool_end, Time = 0.003220 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.771307 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.771565 +INFO: TimeDuration, Event = Add_end, Time = 0.000258 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.771582 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.771811 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.778748 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.778998 +INFO: TimeDuration, Event = Add_end, Time = 0.000250 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.779011 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.779240 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.779251 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.782045 +INFO: TimeDuration, Event = Pool_end, Time = 0.002793 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.787450 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.787612 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.787624 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.787750 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.791543 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.791703 +INFO: TimeDuration, Event = Add_end, Time = 0.000160 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.791716 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.791840 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000124 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.791851 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.794777 +INFO: TimeDuration, Event = Pool_end, Time = 0.002926 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352778.794796 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352778.794869 +INFO: TimeDuration, Event = Mul_end, Time = 0.000072 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.794883 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.794904 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352778.794917 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352778.794961 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 85.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 49.561406, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.815440 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.815912 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.815926 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.816367 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000440 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.823920 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.824386 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.824508 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.824941 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000433 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.824954 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.828737 +INFO: TimeDuration, Event = Pool_end, Time = 0.003783 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.839838 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.840102 +INFO: TimeDuration, Event = Add_end, Time = 0.000265 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.840121 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.840552 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000432 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.847263 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.847515 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.847528 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.847758 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.847770 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.850562 +INFO: TimeDuration, Event = Pool_end, Time = 0.002792 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.855937 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.856100 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.856112 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.856236 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000124 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.860036 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.860195 +INFO: TimeDuration, Event = Add_end, Time = 0.000160 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.860208 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.860333 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.860343 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.863273 +INFO: TimeDuration, Event = Pool_end, Time = 0.002930 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352778.863292 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352778.863362 +INFO: TimeDuration, Event = Mul_end, Time = 0.000070 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.863375 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.863400 +INFO: TimeDuration, Event = Add_end, Time = 0.000024 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352778.863415 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352778.863463 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000048 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 49.463555, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.883800 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.884271 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.884285 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.884726 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000440 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.892251 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.892716 +INFO: TimeDuration, Event = Add_end, Time = 0.000465 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.892733 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.893172 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000438 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.893185 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.896728 +INFO: TimeDuration, Event = Pool_end, Time = 0.003543 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.908162 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.908422 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.908437 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.908667 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.915588 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.915839 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.915852 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.916082 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.916094 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.918886 +INFO: TimeDuration, Event = Pool_end, Time = 0.002792 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.924254 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.924417 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.924477 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.924602 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.928688 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.928851 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.928864 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.928989 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.929002 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.931914 +INFO: TimeDuration, Event = Pool_end, Time = 0.002912 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352778.931933 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352778.932005 +INFO: TimeDuration, Event = Mul_end, Time = 0.000072 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.932019 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.932040 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352778.932053 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352778.932097 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.132401, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.953035 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.953506 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.953522 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.953962 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000440 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.961483 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.961948 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.961962 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.962397 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.962410 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.965964 +INFO: TimeDuration, Event = Pool_end, Time = 0.003553 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.977406 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.977666 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.977682 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.977915 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000233 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.984872 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.985123 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.985136 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.985366 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.985378 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352778.988170 +INFO: TimeDuration, Event = Pool_end, Time = 0.002792 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.993553 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.993716 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.993727 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.993853 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352778.997656 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352778.997818 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352778.997831 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352778.997956 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352778.997968 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.000894 +INFO: TimeDuration, Event = Pool_end, Time = 0.002926 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352779.000913 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352779.000983 +INFO: TimeDuration, Event = Mul_end, Time = 0.000071 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.000997 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.001018 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352779.001031 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352779.001074 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 49.934594, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.021846 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.022316 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.022332 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.022776 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000444 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.030267 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.030734 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.030749 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.031188 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000439 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.031202 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.034746 +INFO: TimeDuration, Event = Pool_end, Time = 0.003544 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.046180 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.046438 +INFO: TimeDuration, Event = Add_end, Time = 0.000258 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.046454 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.046685 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.053618 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.053868 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.053881 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.054111 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.054122 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.056914 +INFO: TimeDuration, Event = Pool_end, Time = 0.002792 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.062285 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.062448 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.062462 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.062587 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.066411 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.066572 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.066585 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.066710 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.066722 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.069643 +INFO: TimeDuration, Event = Pool_end, Time = 0.002921 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352779.069662 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352779.069733 +INFO: TimeDuration, Event = Mul_end, Time = 0.000071 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.069746 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.069767 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352779.069781 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352779.069826 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000045 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.022579, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.090641 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.091113 +INFO: TimeDuration, Event = Add_end, Time = 0.000472 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.091128 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.091564 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000436 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.099143 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.099610 +INFO: TimeDuration, Event = Add_end, Time = 0.000467 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.099625 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.100063 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000438 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.100076 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.103621 +INFO: TimeDuration, Event = Pool_end, Time = 0.003545 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.115050 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.115309 +INFO: TimeDuration, Event = Add_end, Time = 0.000259 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.115326 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.115557 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.122540 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.122791 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.122803 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.123035 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.123048 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.125836 +INFO: TimeDuration, Event = Pool_end, Time = 0.002788 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.131245 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.131407 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.131420 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.131546 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.135334 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.135493 +INFO: TimeDuration, Event = Add_end, Time = 0.000159 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.135505 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.135632 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.135643 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.138568 +INFO: TimeDuration, Event = Pool_end, Time = 0.002925 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352779.138587 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352779.138657 +INFO: TimeDuration, Event = Mul_end, Time = 0.000070 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.138670 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.138692 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352779.138705 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352779.138749 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 85.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.048527, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.159617 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.160088 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.160102 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.160543 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000442 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.168084 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.168546 +INFO: TimeDuration, Event = Add_end, Time = 0.000462 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.168563 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.169000 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000437 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.169014 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.172560 +INFO: TimeDuration, Event = Pool_end, Time = 0.003547 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.183988 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.184245 +INFO: TimeDuration, Event = Add_end, Time = 0.000258 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.184261 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.184493 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.191422 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.191674 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.191686 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.191916 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.191928 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.194717 +INFO: TimeDuration, Event = Pool_end, Time = 0.002789 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.200088 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.200250 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.200262 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.200389 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.204180 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.204340 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.204458 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.204581 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.204592 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.207414 +INFO: TimeDuration, Event = Pool_end, Time = 0.002822 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352779.207434 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352779.207505 +INFO: TimeDuration, Event = Mul_end, Time = 0.000071 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.207519 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.207540 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352779.207554 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352779.207598 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 49.840224, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.228043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.228544 +INFO: TimeDuration, Event = Add_end, Time = 0.000501 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.228742 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.229182 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000440 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.236528 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.236990 +INFO: TimeDuration, Event = Add_end, Time = 0.000462 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.237006 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.237441 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000434 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.237455 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.241004 +INFO: TimeDuration, Event = Pool_end, Time = 0.003549 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.253349 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.253608 +INFO: TimeDuration, Event = Add_end, Time = 0.000259 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.253625 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.253854 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.260819 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.261071 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.261083 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.261313 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.261325 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.264100 +INFO: TimeDuration, Event = Pool_end, Time = 0.002775 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.269479 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.269640 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.269654 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.269778 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.273570 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.273731 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.273742 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.273869 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.273880 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.276803 +INFO: TimeDuration, Event = Pool_end, Time = 0.002923 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352779.276823 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352779.276893 +INFO: TimeDuration, Event = Mul_end, Time = 0.000071 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.276907 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.276929 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352779.276942 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352779.276986 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 84.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.662651, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.297216 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.297686 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.297702 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.298145 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000443 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.305693 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.306159 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.306173 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.306610 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000437 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.306622 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.310157 +INFO: TimeDuration, Event = Pool_end, Time = 0.003535 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.321591 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.321850 +INFO: TimeDuration, Event = Add_end, Time = 0.000259 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.321867 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.322100 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000233 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.329033 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.329284 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.329297 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.329530 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000233 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.329542 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.332331 +INFO: TimeDuration, Event = Pool_end, Time = 0.002789 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.337715 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.337880 +INFO: TimeDuration, Event = Add_end, Time = 0.000165 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.337893 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.338020 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.341801 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.341961 +INFO: TimeDuration, Event = Add_end, Time = 0.000160 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.341974 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.342099 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.342110 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.345033 +INFO: TimeDuration, Event = Pool_end, Time = 0.002923 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352779.345053 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352779.345122 +INFO: TimeDuration, Event = Mul_end, Time = 0.000070 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.345140 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.345161 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352779.345175 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352779.345219 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.199997 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 49.897658, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.366205 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.366679 +INFO: TimeDuration, Event = Add_end, Time = 0.000474 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.366694 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.367136 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000443 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.374637 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.375100 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.375116 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.375554 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000438 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.375567 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.379119 +INFO: TimeDuration, Event = Pool_end, Time = 0.003552 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.390542 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.390802 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.390820 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.391049 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.398001 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.398252 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.398264 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.398496 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.398507 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.401296 +INFO: TimeDuration, Event = Pool_end, Time = 0.002789 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.406664 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.406827 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.406839 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.406965 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.410773 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.410933 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.410947 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.411075 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000128 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.411087 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.414006 +INFO: TimeDuration, Event = Pool_end, Time = 0.002919 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352779.414025 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352779.414096 +INFO: TimeDuration, Event = Mul_end, Time = 0.000071 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.414111 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.414133 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352779.414146 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352779.414190 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 84.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.214562, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.434805 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.435275 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.435289 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.435729 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000440 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.443281 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.443746 +INFO: TimeDuration, Event = Add_end, Time = 0.000465 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.443764 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.444201 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000437 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.444213 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.447763 +INFO: TimeDuration, Event = Pool_end, Time = 0.003549 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.459187 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.459446 +INFO: TimeDuration, Event = Add_end, Time = 0.000259 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.459463 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.459693 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.466623 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.466874 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.466887 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.467116 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.467127 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.469918 +INFO: TimeDuration, Event = Pool_end, Time = 0.002791 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.475294 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.475456 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.475469 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.475595 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.479391 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.479551 +INFO: TimeDuration, Event = Add_end, Time = 0.000160 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.479563 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.479689 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.479738 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.482627 +INFO: TimeDuration, Event = Pool_end, Time = 0.002889 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352779.482646 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352779.482717 +INFO: TimeDuration, Event = Mul_end, Time = 0.000071 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.482730 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.482751 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352779.482764 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352779.482809 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000045 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 85.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 49.889720, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.503886 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.504353 +INFO: TimeDuration, Event = Add_end, Time = 0.000467 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.504365 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.504810 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000445 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.512388 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.512853 +INFO: TimeDuration, Event = Add_end, Time = 0.000465 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.512870 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.513309 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000440 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.513321 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.516869 +INFO: TimeDuration, Event = Pool_end, Time = 0.003547 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.528299 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.528564 +INFO: TimeDuration, Event = Add_end, Time = 0.000265 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.528581 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.528811 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.535740 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.535990 +INFO: TimeDuration, Event = Add_end, Time = 0.000250 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.536003 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.536233 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.536245 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.539038 +INFO: TimeDuration, Event = Pool_end, Time = 0.002793 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.544411 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.544573 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.544586 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.544710 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.548870 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.549032 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.549045 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.549171 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.549184 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.552100 +INFO: TimeDuration, Event = Pool_end, Time = 0.002917 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352779.552120 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352779.552193 +INFO: TimeDuration, Event = Mul_end, Time = 0.000072 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.552206 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.552227 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352779.552241 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352779.552285 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.282557, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.572737 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.573206 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.573221 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.573664 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000443 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.581191 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.581659 +INFO: TimeDuration, Event = Add_end, Time = 0.000467 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.581673 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.582110 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000437 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.582123 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.585667 +INFO: TimeDuration, Event = Pool_end, Time = 0.003544 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.597096 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.597356 +INFO: TimeDuration, Event = Add_end, Time = 0.000259 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.597372 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.597605 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000233 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.604542 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.604800 +INFO: TimeDuration, Event = Add_end, Time = 0.000259 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.604813 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.605043 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.605055 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.607834 +INFO: TimeDuration, Event = Pool_end, Time = 0.002780 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.613207 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.613370 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.613383 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.613513 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000130 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.617306 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.617467 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.617480 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.617605 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.617617 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.620542 +INFO: TimeDuration, Event = Pool_end, Time = 0.002925 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352779.620562 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352779.620633 +INFO: TimeDuration, Event = Mul_end, Time = 0.000071 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.620646 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.620667 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352779.620681 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352779.620724 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 49.875646, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.642612 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.643093 +INFO: TimeDuration, Event = Add_end, Time = 0.000481 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.643113 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.643565 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000452 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.651745 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.652208 +INFO: TimeDuration, Event = Add_end, Time = 0.000463 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.652225 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.652665 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000440 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.652683 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.656224 +INFO: TimeDuration, Event = Pool_end, Time = 0.003541 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.667622 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.667880 +INFO: TimeDuration, Event = Add_end, Time = 0.000258 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.667896 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.668124 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.675093 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.675343 +INFO: TimeDuration, Event = Add_end, Time = 0.000250 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.675355 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.675584 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.675593 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.678383 +INFO: TimeDuration, Event = Pool_end, Time = 0.002789 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.683800 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.683961 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.683973 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.684098 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.687913 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.688074 +INFO: TimeDuration, Event = Add_end, Time = 0.000160 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.688086 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.688219 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000132 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.688230 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.691146 +INFO: TimeDuration, Event = Pool_end, Time = 0.002917 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352779.691165 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352779.691233 +INFO: TimeDuration, Event = Mul_end, Time = 0.000069 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.691246 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.691267 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352779.691280 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352779.691323 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 51.388619, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.712720 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.713195 +INFO: TimeDuration, Event = Add_end, Time = 0.000474 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.713210 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.713648 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000439 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.721076 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.721545 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.721560 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.721997 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000437 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.722010 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.725552 +INFO: TimeDuration, Event = Pool_end, Time = 0.003542 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.737010 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.737267 +INFO: TimeDuration, Event = Add_end, Time = 0.000257 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.737285 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.737514 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.744477 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.744727 +INFO: TimeDuration, Event = Add_end, Time = 0.000250 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.744741 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.744972 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.744983 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.747769 +INFO: TimeDuration, Event = Pool_end, Time = 0.002786 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.753187 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.753348 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.753362 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.753489 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.757282 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.757443 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.757456 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.757582 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.757594 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.760516 +INFO: TimeDuration, Event = Pool_end, Time = 0.002922 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352779.760535 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352779.760606 +INFO: TimeDuration, Event = Mul_end, Time = 0.000070 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.760619 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.760640 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352779.760653 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352779.760698 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000045 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 49.896637, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.781387 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.781861 +INFO: TimeDuration, Event = Add_end, Time = 0.000474 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.781878 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.782322 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000444 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.789692 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.790157 +INFO: TimeDuration, Event = Add_end, Time = 0.000465 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.790171 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.790606 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000434 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.790620 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.794171 +INFO: TimeDuration, Event = Pool_end, Time = 0.003551 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.805600 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.805860 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.805876 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.806106 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.813084 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.813334 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.813348 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.813579 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.813591 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.816378 +INFO: TimeDuration, Event = Pool_end, Time = 0.002787 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.822101 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.822264 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.822277 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.822403 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.826238 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.826400 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.826412 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.826537 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.826549 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.829473 +INFO: TimeDuration, Event = Pool_end, Time = 0.002924 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352779.829492 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352779.829563 +INFO: TimeDuration, Event = Mul_end, Time = 0.000071 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.829577 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.829598 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352779.829615 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352779.829659 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 85.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.161368, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.850644 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.851118 +INFO: TimeDuration, Event = Add_end, Time = 0.000474 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.851169 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.851608 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000439 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.859082 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.859547 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.859562 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.859997 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000434 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.860010 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.863562 +INFO: TimeDuration, Event = Pool_end, Time = 0.003551 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.875002 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.875262 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.875281 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.875510 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.882439 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.882691 +INFO: TimeDuration, Event = Add_end, Time = 0.000253 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.882705 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.882936 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.882948 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.885732 +INFO: TimeDuration, Event = Pool_end, Time = 0.002785 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.891104 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.891266 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.891278 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.891404 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.895228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.895390 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.895402 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.895526 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000124 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.895538 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.898457 +INFO: TimeDuration, Event = Pool_end, Time = 0.002919 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352779.898477 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352779.898548 +INFO: TimeDuration, Event = Mul_end, Time = 0.000071 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.898561 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.898582 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352779.898597 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352779.898640 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.241734, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.919368 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.919839 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.919856 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.920293 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000437 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.927777 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.928241 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.928256 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.928695 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000440 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.928711 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.932258 +INFO: TimeDuration, Event = Pool_end, Time = 0.003547 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.943682 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.943940 +INFO: TimeDuration, Event = Add_end, Time = 0.000258 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.943956 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.944187 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.951150 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.951401 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.951412 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.951642 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.951653 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.954444 +INFO: TimeDuration, Event = Pool_end, Time = 0.002791 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.959813 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.959974 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.959986 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.960111 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.963908 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.964068 +INFO: TimeDuration, Event = Add_end, Time = 0.000160 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.964082 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.964208 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.964219 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352779.967141 +INFO: TimeDuration, Event = Pool_end, Time = 0.002921 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352779.967159 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352779.967227 +INFO: TimeDuration, Event = Mul_end, Time = 0.000068 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.967240 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.967261 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352779.967276 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352779.967318 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 84.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 49.836951, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.988336 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.988809 +INFO: TimeDuration, Event = Add_end, Time = 0.000473 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.988825 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.989267 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000443 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352779.996896 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352779.997362 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352779.997377 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352779.997815 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000438 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352779.997841 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.001368 +INFO: TimeDuration, Event = Pool_end, Time = 0.003526 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.012784 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.013043 +INFO: TimeDuration, Event = Add_end, Time = 0.000259 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.013061 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.013291 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.020257 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.020507 +INFO: TimeDuration, Event = Add_end, Time = 0.000250 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.020521 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.020751 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.020764 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.023549 +INFO: TimeDuration, Event = Pool_end, Time = 0.002785 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.028963 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.029124 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.029137 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.029265 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000128 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.033071 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.033232 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.033289 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.033414 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000124 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.033427 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.036311 +INFO: TimeDuration, Event = Pool_end, Time = 0.002884 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352780.036495 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352780.036567 +INFO: TimeDuration, Event = Mul_end, Time = 0.000072 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.036580 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.036603 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352780.036617 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352780.036661 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000045 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.199997 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.098943, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.057061 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.057533 +INFO: TimeDuration, Event = Add_end, Time = 0.000472 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.057548 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.057992 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000444 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.065512 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.065978 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.065993 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.066433 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000441 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.066447 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.069990 +INFO: TimeDuration, Event = Pool_end, Time = 0.003544 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.081422 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.081682 +INFO: TimeDuration, Event = Add_end, Time = 0.000259 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.081697 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.081930 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000233 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.088893 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.089143 +INFO: TimeDuration, Event = Add_end, Time = 0.000250 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.089155 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.089383 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.089395 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.092187 +INFO: TimeDuration, Event = Pool_end, Time = 0.002792 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.097631 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.097794 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.097807 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.097933 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.101737 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.101897 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.101909 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.102034 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000124 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.102046 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.104969 +INFO: TimeDuration, Event = Pool_end, Time = 0.002924 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352780.104989 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352780.105059 +INFO: TimeDuration, Event = Mul_end, Time = 0.000070 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.105072 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.105093 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352780.105106 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352780.105151 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 84.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 49.987597, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.125822 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.126292 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.126307 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.126745 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000438 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.134265 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.134729 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.134748 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.135181 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000434 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.135198 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.138749 +INFO: TimeDuration, Event = Pool_end, Time = 0.003551 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.150174 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.150433 +INFO: TimeDuration, Event = Add_end, Time = 0.000259 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.150450 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.150681 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.157611 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.157861 +INFO: TimeDuration, Event = Add_end, Time = 0.000250 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.157874 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.158103 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.158115 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.160906 +INFO: TimeDuration, Event = Pool_end, Time = 0.002790 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.166320 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.166483 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.166496 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.166621 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.170448 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.170608 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.170620 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.170746 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.170758 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.173828 +INFO: TimeDuration, Event = Pool_end, Time = 0.003070 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352780.176801 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352780.177169 +INFO: TimeDuration, Event = Mul_end, Time = 0.000367 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.177183 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.177252 +INFO: TimeDuration, Event = Add_end, Time = 0.000069 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352780.177268 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352780.177355 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000087 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 85.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 51.194505, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.198256 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.198744 +INFO: TimeDuration, Event = Add_end, Time = 0.000488 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.198761 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.199206 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000444 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.206860 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.207322 +INFO: TimeDuration, Event = Add_end, Time = 0.000462 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.207337 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.207773 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.207789 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.211363 +INFO: TimeDuration, Event = Pool_end, Time = 0.003573 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.222776 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.223035 +INFO: TimeDuration, Event = Add_end, Time = 0.000259 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.223053 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.223282 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.230212 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.230463 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.230476 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.230706 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.230717 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.233508 +INFO: TimeDuration, Event = Pool_end, Time = 0.002791 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.238894 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.239056 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.239070 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.239195 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.243009 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.243169 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.243181 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.243306 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.243318 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.246241 +INFO: TimeDuration, Event = Pool_end, Time = 0.002923 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352780.246261 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352780.246332 +INFO: TimeDuration, Event = Mul_end, Time = 0.000071 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.246345 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.246367 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352780.246380 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352780.246423 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.376974, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.271741 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.272276 +INFO: TimeDuration, Event = Add_end, Time = 0.000535 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.272682 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.273144 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000461 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.282762 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.283252 +INFO: TimeDuration, Event = Add_end, Time = 0.000490 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.283287 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.283744 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000457 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.283775 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.287206 +INFO: TimeDuration, Event = Pool_end, Time = 0.003431 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.298792 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.299052 +INFO: TimeDuration, Event = Add_end, Time = 0.000261 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.299074 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.299311 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000237 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.306498 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.306758 +INFO: TimeDuration, Event = Add_end, Time = 0.000261 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.306776 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.307009 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000233 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.307024 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.309793 +INFO: TimeDuration, Event = Pool_end, Time = 0.002769 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.315299 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.315466 +INFO: TimeDuration, Event = Add_end, Time = 0.000168 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.315483 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.315613 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000131 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.319588 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.319755 +INFO: TimeDuration, Event = Add_end, Time = 0.000166 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.319771 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.319901 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000129 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.319915 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.322824 +INFO: TimeDuration, Event = Pool_end, Time = 0.002908 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352780.322850 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352780.322930 +INFO: TimeDuration, Event = Mul_end, Time = 0.000080 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.322947 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.322972 +INFO: TimeDuration, Event = Add_end, Time = 0.000025 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352780.322989 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352780.323040 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000051 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 56.623812, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.345490 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.345964 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.345981 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.346422 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000441 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.354132 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.354598 +INFO: TimeDuration, Event = Add_end, Time = 0.000465 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.354614 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.355050 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.355065 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.358628 +INFO: TimeDuration, Event = Pool_end, Time = 0.003564 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.370137 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.370400 +INFO: TimeDuration, Event = Add_end, Time = 0.000263 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.370419 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.370650 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.377649 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.377904 +INFO: TimeDuration, Event = Add_end, Time = 0.000255 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.377918 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.378150 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.378164 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.380955 +INFO: TimeDuration, Event = Pool_end, Time = 0.002792 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.386392 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.386557 +INFO: TimeDuration, Event = Add_end, Time = 0.000166 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.386571 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.386698 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.390566 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.390728 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.390741 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.390867 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.390881 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.393800 +INFO: TimeDuration, Event = Pool_end, Time = 0.002919 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352780.393820 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352780.393894 +INFO: TimeDuration, Event = Mul_end, Time = 0.000074 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.393909 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.393931 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352780.393951 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352780.393998 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000047 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.523118, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.418587 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.419108 +INFO: TimeDuration, Event = Add_end, Time = 0.000521 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.419128 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.419593 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000465 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.427843 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.428318 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.428344 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.428793 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000449 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.428816 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.432328 +INFO: TimeDuration, Event = Pool_end, Time = 0.003511 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.443914 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.444179 +INFO: TimeDuration, Event = Add_end, Time = 0.000265 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.444200 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.444437 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000237 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.451614 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.451868 +INFO: TimeDuration, Event = Add_end, Time = 0.000254 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.451884 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.452118 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000234 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.452133 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.454917 +INFO: TimeDuration, Event = Pool_end, Time = 0.002784 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.460507 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.460682 +INFO: TimeDuration, Event = Add_end, Time = 0.000175 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.460701 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.460833 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000132 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.464780 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.464946 +INFO: TimeDuration, Event = Add_end, Time = 0.000166 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.464962 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.465092 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000130 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.465107 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.468018 +INFO: TimeDuration, Event = Pool_end, Time = 0.002912 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352780.468042 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352780.468122 +INFO: TimeDuration, Event = Mul_end, Time = 0.000080 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.468139 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.468165 +INFO: TimeDuration, Event = Add_end, Time = 0.000025 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352780.468182 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352780.468234 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000051 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 53.965671, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.490459 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.490938 +INFO: TimeDuration, Event = Add_end, Time = 0.000479 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.490958 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.491404 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000446 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.499386 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.499857 +INFO: TimeDuration, Event = Add_end, Time = 0.000472 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.499879 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.500320 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000440 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.500477 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.503868 +INFO: TimeDuration, Event = Pool_end, Time = 0.003391 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.515461 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.515726 +INFO: TimeDuration, Event = Add_end, Time = 0.000265 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.515747 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.515982 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000235 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.523155 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.523411 +INFO: TimeDuration, Event = Add_end, Time = 0.000256 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.523427 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.523661 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000234 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.523676 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.526455 +INFO: TimeDuration, Event = Pool_end, Time = 0.002779 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.531996 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.532165 +INFO: TimeDuration, Event = Add_end, Time = 0.000168 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.532181 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.532314 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000132 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.536313 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.536480 +INFO: TimeDuration, Event = Add_end, Time = 0.000167 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.536499 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.536628 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000130 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.536643 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.539547 +INFO: TimeDuration, Event = Pool_end, Time = 0.002904 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352780.539571 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352780.539650 +INFO: TimeDuration, Event = Mul_end, Time = 0.000079 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.539667 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.539693 +INFO: TimeDuration, Event = Add_end, Time = 0.000025 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352780.539710 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352780.539761 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000051 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 85.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 51.674588, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.561781 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.562265 +INFO: TimeDuration, Event = Add_end, Time = 0.000483 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.562286 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.562731 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000445 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.570859 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.571326 +INFO: TimeDuration, Event = Add_end, Time = 0.000467 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.571347 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.571783 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.571802 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.575338 +INFO: TimeDuration, Event = Pool_end, Time = 0.003536 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.586930 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.587193 +INFO: TimeDuration, Event = Add_end, Time = 0.000263 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.587214 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.587448 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000234 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.594617 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.594872 +INFO: TimeDuration, Event = Add_end, Time = 0.000256 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.594889 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.595122 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000233 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.595136 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.597914 +INFO: TimeDuration, Event = Pool_end, Time = 0.002778 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.606449 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.606630 +INFO: TimeDuration, Event = Add_end, Time = 0.000181 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.606649 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.606789 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000141 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.610734 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.610947 +INFO: TimeDuration, Event = Add_end, Time = 0.000213 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.610965 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.611094 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000129 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.611109 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.613973 +INFO: TimeDuration, Event = Pool_end, Time = 0.002865 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352780.613995 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352780.614072 +INFO: TimeDuration, Event = Mul_end, Time = 0.000078 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.614087 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.614114 +INFO: TimeDuration, Event = Add_end, Time = 0.000027 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352780.614129 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352780.614183 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000053 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 54.480835, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.634978 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.635457 +INFO: TimeDuration, Event = Add_end, Time = 0.000478 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.635472 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.635912 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000440 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.643452 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.643916 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.643932 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.644371 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000439 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.644382 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.647931 +INFO: TimeDuration, Event = Pool_end, Time = 0.003549 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.659355 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.659613 +INFO: TimeDuration, Event = Add_end, Time = 0.000259 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.659629 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.659860 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.666797 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.667047 +INFO: TimeDuration, Event = Add_end, Time = 0.000250 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.667060 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.667291 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.667303 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.670092 +INFO: TimeDuration, Event = Pool_end, Time = 0.002789 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.675502 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.675664 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.675676 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.675804 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.679597 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.679762 +INFO: TimeDuration, Event = Add_end, Time = 0.000164 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.679774 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.679898 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000124 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.679909 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.682831 +INFO: TimeDuration, Event = Pool_end, Time = 0.002922 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352780.682850 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352780.682920 +INFO: TimeDuration, Event = Mul_end, Time = 0.000069 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.682933 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.682954 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352780.682968 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352780.683017 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000049 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 84.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 49.925242, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.703579 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.704047 +INFO: TimeDuration, Event = Add_end, Time = 0.000468 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.704063 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.704502 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000439 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.712074 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.712538 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.712552 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.712991 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000438 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.713003 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.716547 +INFO: TimeDuration, Event = Pool_end, Time = 0.003544 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.727974 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.728233 +INFO: TimeDuration, Event = Add_end, Time = 0.000259 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.728250 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.728481 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.735415 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.735666 +INFO: TimeDuration, Event = Add_end, Time = 0.000250 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.735679 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.735908 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.735920 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.738708 +INFO: TimeDuration, Event = Pool_end, Time = 0.002789 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.744071 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.744232 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.744245 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.744372 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000128 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.748180 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.748538 +INFO: TimeDuration, Event = Add_end, Time = 0.000358 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.748554 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.748677 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000123 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.748689 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.751415 +INFO: TimeDuration, Event = Pool_end, Time = 0.002727 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352780.751434 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352780.751505 +INFO: TimeDuration, Event = Mul_end, Time = 0.000071 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.751518 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.751540 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352780.751553 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352780.751604 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000050 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.199997 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 49.853317, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.772105 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.772583 +INFO: TimeDuration, Event = Add_end, Time = 0.000477 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.772595 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.773036 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000441 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.780575 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.781042 +INFO: TimeDuration, Event = Add_end, Time = 0.000467 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.781060 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.781494 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000435 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.781510 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.785053 +INFO: TimeDuration, Event = Pool_end, Time = 0.003544 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.796501 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.796761 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.796778 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.797008 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.803959 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.804209 +INFO: TimeDuration, Event = Add_end, Time = 0.000250 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.804222 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.804452 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.804461 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.807248 +INFO: TimeDuration, Event = Pool_end, Time = 0.002787 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.812751 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.812916 +INFO: TimeDuration, Event = Add_end, Time = 0.000165 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.812929 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.813055 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.816860 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.817021 +INFO: TimeDuration, Event = Add_end, Time = 0.000160 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.817033 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.817159 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.817171 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.820094 +INFO: TimeDuration, Event = Pool_end, Time = 0.002923 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352780.820112 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352780.820182 +INFO: TimeDuration, Event = Mul_end, Time = 0.000070 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.820194 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.820215 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352780.820228 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352780.820277 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000049 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 84.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.030344, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.840859 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.841331 +INFO: TimeDuration, Event = Add_end, Time = 0.000472 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.841347 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.841787 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000440 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.849326 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.849794 +INFO: TimeDuration, Event = Add_end, Time = 0.000468 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.849810 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.850248 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000438 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.850261 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.853800 +INFO: TimeDuration, Event = Pool_end, Time = 0.003539 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.865235 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.865494 +INFO: TimeDuration, Event = Add_end, Time = 0.000259 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.865510 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.865741 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.872678 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.872927 +INFO: TimeDuration, Event = Add_end, Time = 0.000249 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.872941 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.873169 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.873181 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.875971 +INFO: TimeDuration, Event = Pool_end, Time = 0.002790 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.881346 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.881509 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.881521 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.881649 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000129 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.885427 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.885587 +INFO: TimeDuration, Event = Add_end, Time = 0.000160 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.885599 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.885724 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.885736 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.888659 +INFO: TimeDuration, Event = Pool_end, Time = 0.002924 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352780.888678 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352780.888749 +INFO: TimeDuration, Event = Mul_end, Time = 0.000070 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.888762 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.888783 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352780.888797 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352780.888846 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000050 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 85.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 49.859689, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.909338 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.909806 +INFO: TimeDuration, Event = Add_end, Time = 0.000468 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.909823 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.910261 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000439 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.917993 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.918464 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.918479 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.918918 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000439 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.918931 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.922469 +INFO: TimeDuration, Event = Pool_end, Time = 0.003538 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.933900 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.934157 +INFO: TimeDuration, Event = Add_end, Time = 0.000258 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.934174 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.934405 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.941355 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.941607 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.941626 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.941856 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.941868 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.944646 +INFO: TimeDuration, Event = Pool_end, Time = 0.002779 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.950083 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.950245 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.950258 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.950382 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.954194 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.954354 +INFO: TimeDuration, Event = Add_end, Time = 0.000160 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.954366 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.954492 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.954504 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.957428 +INFO: TimeDuration, Event = Pool_end, Time = 0.002924 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352780.957447 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352780.957516 +INFO: TimeDuration, Event = Mul_end, Time = 0.000070 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.957530 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.957551 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352780.957565 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352780.957609 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.140633, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.978336 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.978807 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.978822 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.979264 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000442 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352780.986800 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352780.987262 +INFO: TimeDuration, Event = Add_end, Time = 0.000462 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352780.987277 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352780.987724 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000447 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352780.987738 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352780.991282 +INFO: TimeDuration, Event = Pool_end, Time = 0.003544 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.002711 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.002969 +INFO: TimeDuration, Event = Add_end, Time = 0.000259 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.002985 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.003215 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.010159 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.010409 +INFO: TimeDuration, Event = Add_end, Time = 0.000250 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.010422 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.010653 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.010664 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.013453 +INFO: TimeDuration, Event = Pool_end, Time = 0.002789 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.018836 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.018996 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.019009 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.019135 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.024928 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.025097 +INFO: TimeDuration, Event = Add_end, Time = 0.000168 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.025111 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.025239 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.025250 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.028162 +INFO: TimeDuration, Event = Pool_end, Time = 0.002912 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352781.028182 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352781.028270 +INFO: TimeDuration, Event = Mul_end, Time = 0.000088 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.028283 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.028326 +INFO: TimeDuration, Event = Add_end, Time = 0.000042 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352781.028480 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352781.028531 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000051 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 52.268981, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.052246 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.052719 +INFO: TimeDuration, Event = Add_end, Time = 0.000474 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.052737 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.053181 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000444 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.060759 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.061225 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.061242 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.061679 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000438 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.061692 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.065235 +INFO: TimeDuration, Event = Pool_end, Time = 0.003543 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.076670 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.076930 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.076948 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.077177 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.084147 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.084399 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.084483 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.084711 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000227 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.084723 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.087442 +INFO: TimeDuration, Event = Pool_end, Time = 0.002719 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.092828 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.092991 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.093004 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.093129 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.096928 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.097089 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.097102 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.097229 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000128 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.097241 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.100162 +INFO: TimeDuration, Event = Pool_end, Time = 0.002921 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352781.100180 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352781.100252 +INFO: TimeDuration, Event = Mul_end, Time = 0.000072 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.100266 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.100286 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352781.100300 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352781.100356 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000056 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 49.926406, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.121205 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.121675 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.121691 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.122130 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000439 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.129673 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.130136 +INFO: TimeDuration, Event = Add_end, Time = 0.000463 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.130151 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.130585 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000434 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.130598 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.134172 +INFO: TimeDuration, Event = Pool_end, Time = 0.003573 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.145577 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.145837 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.145854 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.146085 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.153036 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.153286 +INFO: TimeDuration, Event = Add_end, Time = 0.000250 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.153299 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.153527 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.153540 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.156331 +INFO: TimeDuration, Event = Pool_end, Time = 0.002791 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.161709 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.161872 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.161885 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.162011 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.165829 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.165989 +INFO: TimeDuration, Event = Add_end, Time = 0.000160 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.166002 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.166127 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.166139 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.169063 +INFO: TimeDuration, Event = Pool_end, Time = 0.002924 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352781.169081 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352781.169150 +INFO: TimeDuration, Event = Mul_end, Time = 0.000069 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.169164 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.169185 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352781.169198 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352781.169247 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000050 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 49.924441, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.190475 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.190941 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.190958 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.191399 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000441 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.198973 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.199439 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.199453 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.199890 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000438 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.199919 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.203455 +INFO: TimeDuration, Event = Pool_end, Time = 0.003536 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.214889 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.215146 +INFO: TimeDuration, Event = Add_end, Time = 0.000258 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.215163 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.215394 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.222326 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.222575 +INFO: TimeDuration, Event = Add_end, Time = 0.000249 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.222588 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.222818 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.222829 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.225622 +INFO: TimeDuration, Event = Pool_end, Time = 0.002792 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.231007 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.231168 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.231180 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.231305 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.235107 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.235268 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.235280 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.235406 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.235418 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.238341 +INFO: TimeDuration, Event = Pool_end, Time = 0.002923 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352781.238360 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352781.238430 +INFO: TimeDuration, Event = Mul_end, Time = 0.000070 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.238442 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.238463 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352781.238478 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352781.238625 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000147 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 85.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.451014, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.260734 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.261202 +INFO: TimeDuration, Event = Add_end, Time = 0.000468 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.261219 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.261659 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000440 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.268918 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.269382 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.269398 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.269832 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000434 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.269846 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.273396 +INFO: TimeDuration, Event = Pool_end, Time = 0.003550 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.284875 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.285135 +INFO: TimeDuration, Event = Add_end, Time = 0.000259 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.285151 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.285382 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.292329 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.292580 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.292594 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.292824 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.292836 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.295602 +INFO: TimeDuration, Event = Pool_end, Time = 0.002766 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.300988 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.301152 +INFO: TimeDuration, Event = Add_end, Time = 0.000164 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.301165 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.301291 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.305073 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.305233 +INFO: TimeDuration, Event = Add_end, Time = 0.000160 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.305245 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.305370 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.305382 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.308313 +INFO: TimeDuration, Event = Pool_end, Time = 0.002931 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352781.308481 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352781.308553 +INFO: TimeDuration, Event = Mul_end, Time = 0.000072 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.308566 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.308588 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352781.308601 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352781.308651 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000050 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 49.669377, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.328991 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.329456 +INFO: TimeDuration, Event = Add_end, Time = 0.000465 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.329473 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.329911 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000438 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.337465 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.337929 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.337945 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.338380 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000435 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.338394 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.341939 +INFO: TimeDuration, Event = Pool_end, Time = 0.003545 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.353381 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.353640 +INFO: TimeDuration, Event = Add_end, Time = 0.000259 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.353657 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.353925 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000268 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.360837 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.361088 +INFO: TimeDuration, Event = Add_end, Time = 0.000250 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.361100 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.361330 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.361342 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.364133 +INFO: TimeDuration, Event = Pool_end, Time = 0.002791 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.369527 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.369690 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.369702 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.369830 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000128 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.373663 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.373824 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.373837 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.373965 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000128 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.373990 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.376905 +INFO: TimeDuration, Event = Pool_end, Time = 0.002915 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352781.376925 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352781.376998 +INFO: TimeDuration, Event = Mul_end, Time = 0.000073 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.377012 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.377033 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352781.377046 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352781.377090 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000043 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 84.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 49.978817, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.416795 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.417269 +INFO: TimeDuration, Event = Add_end, Time = 0.000474 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.417287 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.417732 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000445 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.425297 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.425760 +INFO: TimeDuration, Event = Add_end, Time = 0.000462 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.425776 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.426214 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000437 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.426226 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.429774 +INFO: TimeDuration, Event = Pool_end, Time = 0.003547 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.441246 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.441506 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.441522 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.441755 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.448718 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.448968 +INFO: TimeDuration, Event = Add_end, Time = 0.000250 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.448981 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.449213 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.449225 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.452014 +INFO: TimeDuration, Event = Pool_end, Time = 0.002790 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.457418 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.457581 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.457594 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.457722 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000128 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.461543 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.461704 +INFO: TimeDuration, Event = Add_end, Time = 0.000160 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.461717 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.461842 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000124 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.461854 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.464784 +INFO: TimeDuration, Event = Pool_end, Time = 0.002931 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352781.464804 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352781.464875 +INFO: TimeDuration, Event = Mul_end, Time = 0.000071 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.464889 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.464910 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352781.464924 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352781.464975 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000051 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.199997 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.126807, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.485389 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.485859 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.485875 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.486315 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000440 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.493902 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.494366 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.494384 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.494819 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000435 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.494835 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.498380 +INFO: TimeDuration, Event = Pool_end, Time = 0.003545 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.509830 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.510089 +INFO: TimeDuration, Event = Add_end, Time = 0.000259 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.510106 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.510335 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.517299 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.517551 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.517564 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.517795 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.517807 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.520592 +INFO: TimeDuration, Event = Pool_end, Time = 0.002785 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.525980 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.526143 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.526156 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.526281 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.530106 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.530266 +INFO: TimeDuration, Event = Add_end, Time = 0.000160 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.530279 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.530404 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.530417 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.533341 +INFO: TimeDuration, Event = Pool_end, Time = 0.002925 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352781.533361 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352781.533433 +INFO: TimeDuration, Event = Mul_end, Time = 0.000072 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.533447 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.533468 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352781.533483 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352781.533526 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 84.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.053880, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.553746 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.554223 +INFO: TimeDuration, Event = Add_end, Time = 0.000478 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.554238 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.554675 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000437 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.562248 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.562715 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.562732 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.563169 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000437 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.563184 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.566726 +INFO: TimeDuration, Event = Pool_end, Time = 0.003543 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.578175 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.578435 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.578452 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.578682 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.585651 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.585903 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.585917 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.586148 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.586161 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.588945 +INFO: TimeDuration, Event = Pool_end, Time = 0.002785 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.594338 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.594501 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.594515 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.594641 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.598484 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.598645 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.598658 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.598784 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.598797 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.601719 +INFO: TimeDuration, Event = Pool_end, Time = 0.002922 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352781.601739 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352781.601812 +INFO: TimeDuration, Event = Mul_end, Time = 0.000073 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.601825 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.601846 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352781.601860 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352781.601912 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000052 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 85.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.155274, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.625675 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.626146 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.626163 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.626604 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000441 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.634131 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.634597 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.634613 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.635051 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000437 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.635063 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.638609 +INFO: TimeDuration, Event = Pool_end, Time = 0.003546 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.650041 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.650301 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.650317 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.650550 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000233 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.657481 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.657731 +INFO: TimeDuration, Event = Add_end, Time = 0.000250 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.657744 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.657974 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.657985 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.660777 +INFO: TimeDuration, Event = Pool_end, Time = 0.002792 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.666166 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.666328 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.666341 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.666466 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.670290 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.670450 +INFO: TimeDuration, Event = Add_end, Time = 0.000160 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.670462 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.670586 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000124 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.670598 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.673525 +INFO: TimeDuration, Event = Pool_end, Time = 0.002927 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352781.673544 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352781.673616 +INFO: TimeDuration, Event = Mul_end, Time = 0.000071 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.673629 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.673651 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352781.673664 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352781.673714 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000050 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 49.992529, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.694202 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.694674 +INFO: TimeDuration, Event = Add_end, Time = 0.000472 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.694689 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.695128 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000439 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.702652 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.703118 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.703132 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.703570 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000437 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.703585 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.707132 +INFO: TimeDuration, Event = Pool_end, Time = 0.003547 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.718571 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.718853 +INFO: TimeDuration, Event = Add_end, Time = 0.000282 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.718871 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.719102 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.726015 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.726266 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.726278 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.726507 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.726519 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.729310 +INFO: TimeDuration, Event = Pool_end, Time = 0.002790 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.734682 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.734845 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.734858 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.734987 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000129 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.738791 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.738952 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.738965 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.739090 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.739101 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.742024 +INFO: TimeDuration, Event = Pool_end, Time = 0.002922 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352781.742042 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352781.742112 +INFO: TimeDuration, Event = Mul_end, Time = 0.000070 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.742127 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.742149 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352781.742162 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352781.742212 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000050 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.136134, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.763058 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.763530 +INFO: TimeDuration, Event = Add_end, Time = 0.000472 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.763545 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.763986 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000441 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.771558 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.772023 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.772041 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.772479 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000438 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.772490 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.776034 +INFO: TimeDuration, Event = Pool_end, Time = 0.003543 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.787462 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.787719 +INFO: TimeDuration, Event = Add_end, Time = 0.000257 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.787735 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.787966 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.794894 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.795146 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.795158 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.795388 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.795398 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.798190 +INFO: TimeDuration, Event = Pool_end, Time = 0.002792 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.803568 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.803729 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.803742 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.803868 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.807664 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.807824 +INFO: TimeDuration, Event = Add_end, Time = 0.000160 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.807838 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.807964 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.807975 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.810896 +INFO: TimeDuration, Event = Pool_end, Time = 0.002922 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352781.810915 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352781.810986 +INFO: TimeDuration, Event = Mul_end, Time = 0.000070 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.810998 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.811019 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352781.811033 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352781.811083 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000050 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 49.906287, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.832048 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.832518 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.832712 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.833150 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000438 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.840540 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.841007 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.841022 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.841461 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000439 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.841475 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.845014 +INFO: TimeDuration, Event = Pool_end, Time = 0.003539 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.856449 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.856709 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.856725 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.856971 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000245 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.863908 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.864158 +INFO: TimeDuration, Event = Add_end, Time = 0.000250 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.864171 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.864401 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.864480 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.867200 +INFO: TimeDuration, Event = Pool_end, Time = 0.002721 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.872585 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.872747 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.872760 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.872886 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.876698 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.876857 +INFO: TimeDuration, Event = Add_end, Time = 0.000160 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.876870 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.876995 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.877007 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.879932 +INFO: TimeDuration, Event = Pool_end, Time = 0.002924 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352781.879951 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352781.880021 +INFO: TimeDuration, Event = Mul_end, Time = 0.000070 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.880034 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.880056 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352781.880069 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352781.880119 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000050 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 49.888758, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.900726 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.901193 +INFO: TimeDuration, Event = Add_end, Time = 0.000467 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.901210 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.901654 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000444 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.909231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.909708 +INFO: TimeDuration, Event = Add_end, Time = 0.000477 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.909724 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.910165 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000440 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.910179 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.913713 +INFO: TimeDuration, Event = Pool_end, Time = 0.003534 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.925345 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.925643 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.925661 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.925892 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.932805 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.933055 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.933068 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.933300 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.933312 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.936102 +INFO: TimeDuration, Event = Pool_end, Time = 0.002790 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.941485 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.941648 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.941661 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.941786 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.945662 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.945823 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.945834 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.945960 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.945971 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.948898 +INFO: TimeDuration, Event = Pool_end, Time = 0.002927 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352781.948918 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352781.949003 +INFO: TimeDuration, Event = Mul_end, Time = 0.000085 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.949019 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.949044 +INFO: TimeDuration, Event = Add_end, Time = 0.000024 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352781.949057 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352781.949104 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000047 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 85.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.262645, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.970047 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.970514 +INFO: TimeDuration, Event = Add_end, Time = 0.000467 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.970530 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.970968 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000438 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.978573 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.979038 +INFO: TimeDuration, Event = Add_end, Time = 0.000465 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.979055 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.979491 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352781.979506 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352781.983049 +INFO: TimeDuration, Event = Pool_end, Time = 0.003543 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352781.994493 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352781.994755 +INFO: TimeDuration, Event = Add_end, Time = 0.000261 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352781.994771 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352781.995003 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.001985 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.002236 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.002250 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.002481 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.002493 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.005280 +INFO: TimeDuration, Event = Pool_end, Time = 0.002786 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.010659 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.010822 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.010835 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.010973 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000138 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.014797 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.014958 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.014971 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.015095 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000124 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.015108 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.018031 +INFO: TimeDuration, Event = Pool_end, Time = 0.002923 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352782.018050 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352782.018121 +INFO: TimeDuration, Event = Mul_end, Time = 0.000071 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.018135 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.018156 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352782.018170 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352782.018214 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.385773, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.039035 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.039504 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.039521 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.039960 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000438 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.047554 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.048019 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.048037 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.048475 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000438 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.048486 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.052051 +INFO: TimeDuration, Event = Pool_end, Time = 0.003565 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.063472 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.063732 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.063748 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.063978 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.070932 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.071183 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.071196 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.071424 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000228 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.071436 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.074228 +INFO: TimeDuration, Event = Pool_end, Time = 0.002792 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.079614 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.079776 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.079789 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.079915 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.083806 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.083967 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.083981 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.084105 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000124 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.084117 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.087041 +INFO: TimeDuration, Event = Pool_end, Time = 0.002924 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352782.087060 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352782.087132 +INFO: TimeDuration, Event = Mul_end, Time = 0.000072 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.087146 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.087168 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352782.087183 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352782.087233 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000050 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 84.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.110609, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.107988 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.108458 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.108717 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.109158 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000441 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.116537 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.117004 +INFO: TimeDuration, Event = Add_end, Time = 0.000467 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.117021 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.117464 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000443 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.117481 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.121017 +INFO: TimeDuration, Event = Pool_end, Time = 0.003535 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.132489 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.132750 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.132767 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.132998 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.139980 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.140230 +INFO: TimeDuration, Event = Add_end, Time = 0.000250 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.140243 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.140480 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000236 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.140492 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.143274 +INFO: TimeDuration, Event = Pool_end, Time = 0.002781 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.148704 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.148868 +INFO: TimeDuration, Event = Add_end, Time = 0.000164 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.148880 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.149007 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.152859 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.153020 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.153034 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.153161 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.153174 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.156094 +INFO: TimeDuration, Event = Pool_end, Time = 0.002920 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352782.156114 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352782.156186 +INFO: TimeDuration, Event = Mul_end, Time = 0.000072 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.156200 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.156222 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352782.156235 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352782.156279 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.199997 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 49.956923, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.176760 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.177231 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.177247 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.177688 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000441 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.185265 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.185733 +INFO: TimeDuration, Event = Add_end, Time = 0.000468 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.185749 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.186188 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000438 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.186201 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.189742 +INFO: TimeDuration, Event = Pool_end, Time = 0.003541 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.201211 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.201472 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.201488 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.201720 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.208693 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.208947 +INFO: TimeDuration, Event = Add_end, Time = 0.000255 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.208961 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.209191 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.209203 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.211988 +INFO: TimeDuration, Event = Pool_end, Time = 0.002785 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.217381 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.217543 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.217556 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.217684 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.221732 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.221894 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.221906 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.222033 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.222046 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.224965 +INFO: TimeDuration, Event = Pool_end, Time = 0.002919 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352782.224985 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352782.225057 +INFO: TimeDuration, Event = Mul_end, Time = 0.000072 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.225070 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.225092 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352782.225105 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352782.225154 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000049 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 84.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.307058, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.245752 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.246224 +INFO: TimeDuration, Event = Add_end, Time = 0.000472 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.246239 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.246680 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000440 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.255710 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.256173 +INFO: TimeDuration, Event = Add_end, Time = 0.000463 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.256189 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.256626 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000437 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.256644 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.260189 +INFO: TimeDuration, Event = Pool_end, Time = 0.003545 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.271621 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.271881 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.271898 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.272127 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.279098 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.279351 +INFO: TimeDuration, Event = Add_end, Time = 0.000253 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.279365 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.279595 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.279619 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.282395 +INFO: TimeDuration, Event = Pool_end, Time = 0.002776 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.287780 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.287943 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.287956 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.288081 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.291901 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.292063 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.292076 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.292203 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.292215 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.295151 +INFO: TimeDuration, Event = Pool_end, Time = 0.002936 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352782.295169 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352782.295239 +INFO: TimeDuration, Event = Mul_end, Time = 0.000070 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.295253 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.295274 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352782.295288 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352782.295352 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000064 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 85.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 51.559673, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.315734 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.316196 +INFO: TimeDuration, Event = Add_end, Time = 0.000462 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.316210 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.316650 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000440 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.324228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.324692 +INFO: TimeDuration, Event = Add_end, Time = 0.000465 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.324709 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.325148 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000439 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.325163 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.328866 +INFO: TimeDuration, Event = Pool_end, Time = 0.003704 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.340148 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.340407 +INFO: TimeDuration, Event = Add_end, Time = 0.000259 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.340422 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.340658 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000235 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.347634 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.347885 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.347898 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.348129 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.348141 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.350937 +INFO: TimeDuration, Event = Pool_end, Time = 0.002796 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.356322 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.356484 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.356498 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.356625 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.360451 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.360612 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.360626 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.360751 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.360763 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.363687 +INFO: TimeDuration, Event = Pool_end, Time = 0.002924 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352782.363706 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352782.363777 +INFO: TimeDuration, Event = Mul_end, Time = 0.000071 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.363791 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.363813 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352782.363827 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352782.363877 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000050 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.040010, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.384723 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.385198 +INFO: TimeDuration, Event = Add_end, Time = 0.000474 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.385214 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.385654 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000440 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.392962 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.393426 +INFO: TimeDuration, Event = Add_end, Time = 0.000465 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.393443 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.393880 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000438 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.393896 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.397433 +INFO: TimeDuration, Event = Pool_end, Time = 0.003537 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.408875 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.409136 +INFO: TimeDuration, Event = Add_end, Time = 0.000261 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.409153 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.409385 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.416360 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.416612 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.416624 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.416857 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000233 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.416869 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.419651 +INFO: TimeDuration, Event = Pool_end, Time = 0.002782 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.425041 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.425204 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.425218 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.425344 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.429166 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.429327 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.429342 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.429468 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.429479 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.432408 +INFO: TimeDuration, Event = Pool_end, Time = 0.002928 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352782.432486 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352782.432557 +INFO: TimeDuration, Event = Mul_end, Time = 0.000071 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.432570 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.432592 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352782.432606 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352782.432649 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.161508, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.453686 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.454152 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.454169 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.454608 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000440 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.462188 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.462651 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.462669 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.463104 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000434 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.463115 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.466662 +INFO: TimeDuration, Event = Pool_end, Time = 0.003548 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.478106 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.478365 +INFO: TimeDuration, Event = Add_end, Time = 0.000259 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.478380 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.478611 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.485581 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.485837 +INFO: TimeDuration, Event = Add_end, Time = 0.000255 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.485850 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.486080 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.486093 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.488875 +INFO: TimeDuration, Event = Pool_end, Time = 0.002783 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.494321 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.494485 +INFO: TimeDuration, Event = Add_end, Time = 0.000164 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.494499 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.494625 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.498457 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.498618 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.498631 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.498756 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.498769 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.501691 +INFO: TimeDuration, Event = Pool_end, Time = 0.002923 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352782.501711 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352782.501780 +INFO: TimeDuration, Event = Mul_end, Time = 0.000069 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.501794 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.501815 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352782.501830 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352782.501881 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000051 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.269181, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.522955 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.523425 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.523440 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.523877 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000437 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.531462 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.531927 +INFO: TimeDuration, Event = Add_end, Time = 0.000465 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.531943 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.532383 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000440 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.532611 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.535940 +INFO: TimeDuration, Event = Pool_end, Time = 0.003329 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.547387 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.547647 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.547664 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.547894 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.554862 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.555114 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.555127 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.555357 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.555369 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.558158 +INFO: TimeDuration, Event = Pool_end, Time = 0.002788 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.563579 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.563741 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.563755 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.563880 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.567691 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.567851 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.567864 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.567989 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.568001 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.570923 +INFO: TimeDuration, Event = Pool_end, Time = 0.002922 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352782.570942 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352782.571012 +INFO: TimeDuration, Event = Mul_end, Time = 0.000070 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.571026 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.571047 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352782.571062 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352782.571112 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000050 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.068627, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.591986 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.592455 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.592468 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.592907 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000440 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.600498 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.600965 +INFO: TimeDuration, Event = Add_end, Time = 0.000467 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.600981 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.601420 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000439 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.601434 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.604975 +INFO: TimeDuration, Event = Pool_end, Time = 0.003542 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.616426 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.616687 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.616704 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.616935 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.623909 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.624167 +INFO: TimeDuration, Event = Add_end, Time = 0.000258 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.624180 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.624411 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.624421 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.627211 +INFO: TimeDuration, Event = Pool_end, Time = 0.002790 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.632672 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.632834 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.632847 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.632973 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.636795 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.636956 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.636968 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.637094 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.637106 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.640028 +INFO: TimeDuration, Event = Pool_end, Time = 0.002922 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352782.640047 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352782.640117 +INFO: TimeDuration, Event = Mul_end, Time = 0.000071 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.640131 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.640153 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352782.640167 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352782.640217 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000051 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 85.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.133871, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.660913 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.661385 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.661402 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.661845 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000443 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.669401 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.669867 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.669883 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.670320 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.670333 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.673875 +INFO: TimeDuration, Event = Pool_end, Time = 0.003542 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.685323 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.685583 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.685600 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.685830 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.692793 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.693044 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.693057 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.693287 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.693300 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.696088 +INFO: TimeDuration, Event = Pool_end, Time = 0.002788 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.701556 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.701720 +INFO: TimeDuration, Event = Add_end, Time = 0.000164 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.701734 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.701860 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.705732 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.705894 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.705907 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.706033 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.706045 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.708965 +INFO: TimeDuration, Event = Pool_end, Time = 0.002919 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352782.708985 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352782.709055 +INFO: TimeDuration, Event = Mul_end, Time = 0.000070 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.709069 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.709090 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352782.709104 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352782.709155 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000051 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.142887, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.729750 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.730218 +INFO: TimeDuration, Event = Add_end, Time = 0.000468 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.730235 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.730677 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000442 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.740292 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.741095 +INFO: TimeDuration, Event = Add_end, Time = 0.000802 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.741115 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.741551 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000437 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.741567 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.744874 +INFO: TimeDuration, Event = Pool_end, Time = 0.003307 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.755095 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.755352 +INFO: TimeDuration, Event = Add_end, Time = 0.000257 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.755369 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.755599 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.762587 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.762837 +INFO: TimeDuration, Event = Add_end, Time = 0.000250 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.762850 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.763079 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.763091 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.765881 +INFO: TimeDuration, Event = Pool_end, Time = 0.002790 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.771268 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.771429 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.771443 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.771569 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.775476 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.775636 +INFO: TimeDuration, Event = Add_end, Time = 0.000160 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.775648 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.775773 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000125 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.775784 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.778712 +INFO: TimeDuration, Event = Pool_end, Time = 0.002927 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352782.778731 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352782.778808 +INFO: TimeDuration, Event = Mul_end, Time = 0.000077 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.778821 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.778844 +INFO: TimeDuration, Event = Add_end, Time = 0.000023 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352782.778862 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352782.778910 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000048 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 84.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 51.094697, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.799753 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.800225 +INFO: TimeDuration, Event = Add_end, Time = 0.000473 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.800242 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.800682 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000439 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.808264 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.808730 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.808748 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.809184 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000437 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.809199 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.812742 +INFO: TimeDuration, Event = Pool_end, Time = 0.003543 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.824197 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.824571 +INFO: TimeDuration, Event = Add_end, Time = 0.000374 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.824591 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.824820 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.831666 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.831917 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.831930 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.832160 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.832172 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.834966 +INFO: TimeDuration, Event = Pool_end, Time = 0.002793 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.840353 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.840516 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.840529 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.840656 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.844547 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.844708 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.844722 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.844849 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.844861 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.847726 +INFO: TimeDuration, Event = Pool_end, Time = 0.002865 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352782.847745 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352782.847816 +INFO: TimeDuration, Event = Mul_end, Time = 0.000071 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.847830 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.847852 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352782.847867 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352782.847919 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000052 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.199997 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.016341, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.868773 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.869243 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.869258 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.869701 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000442 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.877281 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.877751 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.877767 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.878204 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000437 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.878219 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.881755 +INFO: TimeDuration, Event = Pool_end, Time = 0.003537 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.893205 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.893464 +INFO: TimeDuration, Event = Add_end, Time = 0.000259 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.893481 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.893711 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.900683 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.900935 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.900948 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.901178 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.901190 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.903981 +INFO: TimeDuration, Event = Pool_end, Time = 0.002791 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.909369 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.909532 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.909545 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.909673 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.913498 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.913658 +INFO: TimeDuration, Event = Add_end, Time = 0.000160 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.913671 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.913798 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.913810 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.916728 +INFO: TimeDuration, Event = Pool_end, Time = 0.002918 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352782.916748 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352782.916819 +INFO: TimeDuration, Event = Mul_end, Time = 0.000071 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.916832 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.916854 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352782.916868 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352782.916912 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000044 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 84.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.050766, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.937754 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.938227 +INFO: TimeDuration, Event = Add_end, Time = 0.000473 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.938244 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.938684 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000440 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.946253 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.946717 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.946734 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.947170 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.947185 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.950725 +INFO: TimeDuration, Event = Pool_end, Time = 0.003540 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.962179 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.962439 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.962456 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.962688 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.969687 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.969939 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.969952 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.970184 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.970196 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.972985 +INFO: TimeDuration, Event = Pool_end, Time = 0.002789 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.978390 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.978558 +INFO: TimeDuration, Event = Add_end, Time = 0.000168 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.978571 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.978697 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.982554 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.982715 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352782.982728 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352782.982854 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000126 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352782.982867 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352782.985788 +INFO: TimeDuration, Event = Pool_end, Time = 0.002920 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352782.985807 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352782.985878 +INFO: TimeDuration, Event = Mul_end, Time = 0.000072 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352782.985892 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352782.985914 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352782.985931 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352782.985983 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000052 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 85.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.291339, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352783.007325 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352783.007804 +INFO: TimeDuration, Event = Add_end, Time = 0.000479 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352783.007818 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352783.008257 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000439 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352783.015840 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352783.016312 +INFO: TimeDuration, Event = Add_end, Time = 0.000472 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352783.016326 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352783.016767 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000441 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352783.016782 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352783.020329 +INFO: TimeDuration, Event = Pool_end, Time = 0.003547 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352783.031762 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352783.032022 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352783.032040 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352783.032270 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352783.039233 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352783.039484 +INFO: TimeDuration, Event = Add_end, Time = 0.000250 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352783.039497 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352783.039726 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352783.039738 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352783.042530 +INFO: TimeDuration, Event = Pool_end, Time = 0.002792 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352783.047914 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352783.048077 +INFO: TimeDuration, Event = Add_end, Time = 0.000163 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352783.048091 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352783.048217 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352783.052049 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352783.052211 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352783.052224 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352783.052353 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000129 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352783.052465 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352783.055285 +INFO: TimeDuration, Event = Pool_end, Time = 0.002820 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352783.055303 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352783.055375 +INFO: TimeDuration, Event = Mul_end, Time = 0.000072 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352783.055389 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352783.055411 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352783.055425 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352783.055475 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000050 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 83.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 49.971234, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352783.076248 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352783.076717 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352783.076733 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352783.077178 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000445 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352783.084746 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352783.085214 +INFO: TimeDuration, Event = Add_end, Time = 0.000468 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352783.085230 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352783.085671 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000441 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352783.085685 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 32, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352783.089224 +INFO: TimeDuration, Event = Pool_end, Time = 0.003539 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352783.100988 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352783.101249 +INFO: TimeDuration, Event = Add_end, Time = 0.000261 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352783.101267 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352783.101498 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000232 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352783.108495 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352783.108747 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352783.108761 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352783.108991 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352783.109003 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352783.111791 +INFO: TimeDuration, Event = Pool_end, Time = 0.002788 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352783.117208 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352783.117372 +INFO: TimeDuration, Event = Add_end, Time = 0.000164 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352783.117386 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352783.117516 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000131 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352783.121344 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352783.121505 +INFO: TimeDuration, Event = Add_end, Time = 0.000161 +INFO: *** TensorTanh +INFO: AbsoluteTime, Event = Tanh, Time = 1607352783.121517 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Tanh_end, Time = 1607352783.121644 +INFO: TimeDuration, Event = Tanh_end, Time = 0.000127 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352783.121656 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352783.124577 +INFO: TimeDuration, Event = Pool_end, Time = 0.002921 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352783.124597 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 2048 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352783.124669 +INFO: TimeDuration, Event = Mul_end, Time = 0.000072 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352783.124682 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352783.124703 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352783.124717 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352783.124768 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000051 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 86.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 0. +DEBUG: findTargetConfiguration: goalVal: -0.060000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 50.430090, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +Exiting profiler +INFO: Writing Runtime Profile Info File... +INFO: Done writing profile. diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/out-run-1 b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/out-run-1 new file mode 100644 index 0000000000000000000000000000000000000000..9403664de162f84dcaa420755304e5c308af51e5 --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/out-run-1 @@ -0,0 +1 @@ +run_dnn_frequency_exp.sh: line 28: ./alexnet2_loop_wrapperapi_linked: No such file or directory diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/predictive/alexnet2.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/predictive/alexnet2.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ec4a06d3dbd2e088d6db287d23dd3bd5aad7ddb --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet2/predictive/alexnet2.txt @@ -0,0 +1,419 @@ +1114.3009809999999 ++++++ +conf1 1 1 84.98 0.0 +1 gpu conv fp32 11 add fp32 1 tanh fp32 1 +2 gpu conv fp32 11 add fp32 1 tanh fp32 1 pool_max fp32 1 +3 gpu conv fp32 11 add fp32 1 tanh fp32 1 +4 gpu conv fp32 11 add fp32 1 tanh fp32 1 pool_max fp32 1 +5 gpu conv fp32 11 add fp32 1 tanh fp32 1 +6 gpu conv fp32 11 add fp32 1 tanh fp32 1 pool_max fp32 1 +7 gpu mul fp32 11 add fp32 1 +8 gpu softmax fp32 1 +----- ++++++ +conf2 2.4248748377353113 2.0815908534183163 84.5 0.480000000000004 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 157 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf3 2.4055188425519614 2.0586265720811823 84.48 0.5 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 266 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 269 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 157 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf4 2.4156140842962985 2.0617867479342706 84.28 0.7000000000000028 +1 gpu conv perf_fp16 157 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 267 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 163 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf5 2.396416918342732 2.0506214971794585 84.02 0.960000000000008 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 151 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 157 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf6 2.463002582910052 2.1171077568609458 83.84 1.1400000000000006 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 266 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 167 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf7 2.360283215266004 2.0255245321874304 83.78 1.2000000000000028 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 157 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf8 2.4140791541736157 2.0671513522247653 83.74000000000001 1.2399999999999949 +1 gpu conv fp16 11 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 267 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 160 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf9 2.457753689612079 2.1086250651240137 83.7 1.2800000000000011 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 266 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 163 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf10 2.459170454055443 2.1111925341396343 83.7 1.2800000000000011 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 266 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 164 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf11 2.4135986141645764 2.060453960420927 83.62 1.3599999999999994 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 267 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 266 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 157 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf12 2.4631278039012106 2.1092094797926637 83.58 1.4000000000000057 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 266 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 157 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf13 2.535761391794481 2.16998336112692 83.58 1.4000000000000057 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 266 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 157 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf14 2.289006193945062 1.961240158652051 83.54 1.4399999999999977 +1 gpu conv perf_fp16 167 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 155 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 157 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf15 2.4257674844112573 2.0808440756495563 83.5 1.480000000000004 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 161 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf16 2.458122368488622 2.109531159729078 83.48 1.5 +1 gpu conv fp16 11 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 162 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf17 2.281072202152105 1.9539314420536427 83.46000000000001 1.519999999999996 +1 gpu conv fp16 11 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 160 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 160 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf18 2.4572171342078444 2.1088933553775697 83.46000000000001 1.519999999999996 +1 gpu conv fp16 11 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 163 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf19 2.3017607719030058 1.9782265708150768 83.42 1.5600000000000023 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 162 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 261 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 162 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf20 2.379206814483014 2.047909200292713 83.39999999999999 1.5800000000000125 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 266 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 151 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf21 2.4636282705302537 2.1162281156388527 83.39999999999999 1.5800000000000125 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 160 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf22 2.461590101374146 2.1108493881199184 83.22 1.7600000000000051 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 266 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 161 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf23 2.537054645442804 2.167568834938183 83.22 1.7600000000000051 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 266 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 157 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf24 2.4631604723407885 2.1099694757102845 83.17999999999999 1.8000000000000114 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 267 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 157 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf25 2.4636282705302537 2.1162281156388527 83.14 1.8400000000000034 +1 gpu conv fp16 11 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 160 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf26 2.462588899729088 2.109477918791931 83.14 1.8400000000000034 +1 gpu conv fp16 11 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 157 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf27 2.4638085754689025 2.1071960926343603 83.1 1.8800000000000097 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 157 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf28 2.4640079766123635 2.110326453157297 83.08 1.9000000000000057 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 261 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv fp16 12 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf29 2.459337622764853 2.107249218450713 83.06 1.9200000000000017 +1 gpu conv fp16 11 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 162 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf30 2.538176340059405 2.173287257415721 83.02000000000001 1.9599999999999937 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 164 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf31 2.3905426931959846 2.044333576277581 83.02000000000001 1.9599999999999937 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 160 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 261 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 157 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf32 2.459337622764853 2.107249218450713 83.0 1.980000000000004 +1 gpu conv fp16 11 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 162 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf33 2.458968579288317 2.1063450826631396 82.89999999999999 2.0800000000000125 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 266 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 163 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf34 2.2912974651603877 1.9670210508860688 82.8 2.180000000000007 +1 gpu conv perf_fp16 168 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 155 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 160 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf35 2.4648489763056327 2.113931670664391 82.66 2.3200000000000074 +1 gpu conv fp16 11 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 160 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf36 2.4599076869402854 2.1077397371200193 82.6 2.3800000000000097 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 267 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 162 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf37 2.4636282705302537 2.1162281156388527 82.54 2.4399999999999977 +1 gpu conv fp16 11 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 268 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 160 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- ++++++ +conf38 2.591814267389778 2.222680944458784 82.26 2.719999999999999 +1 gpu conv fp16 12 add fp16 1 tanh fp16 1 +2 gpu conv perf_fp16 154 add fp16 1 tanh fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 261 add fp16 1 tanh fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 tanh fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 tanh fp16 1 +6 gpu conv perf_fp16 157 add fp16 1 tanh fp16 1 pool_max fp16 1 +7 gpu mul fp16 12 add fp16 1 +8 gpu softmax fp32 1 +----- diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet_imagenet/Makefile b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet_imagenet/Makefile index 877eebf8776077d02b90f30a3f669428b6929731..0e6d9e4f08d33fb62bfb20ac8eb4f86ea4ca87de 100644 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet_imagenet/Makefile +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet_imagenet/Makefile @@ -22,7 +22,7 @@ TENSOR_AUTOTUNER_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/lib/libtensor_au CC_FLAGS = -I $(LLVM_INCLUDE_DIR) -I $(TENSOR_INCLUDE_DIR) -I $(TENSOR_RT_INCLUDE_DIR) -I $(CUDA_INCLUDE_PATH) -fno-exceptions -ffast-math -std=c++11 -O3 CCFLAGS += -DDEVICE=CUDNN_TARGET -LINKER_FLAGS = -lpthread -lcudart -lcurand -lcudnn -lcublas -lcufft -lOpenCL -lstdc++fs -lomp +LINKER_FLAGS = -lpthread -lcudart -lcurand -lcudnn -lcublas -lcufft -lOpenCL -lstdc++fs -lomp HPVM_LIB_DIR = $(HPVM_BUILD_DIR)/lib @@ -36,11 +36,15 @@ VISC_OPTFLAGS2 = -load $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LL WRAPPER_API_QUANT_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/data/quant_ranges_rt.txt CONF_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/data/tuner_confs.txt -#CONF_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)_imagenet/data/alexnet_imagenet_loss_123_batch420_dev_tuner_valid_fp16__soc.txt VISC_OPTFLAGS3 = -load $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LLVMInPlaceDFGAnalysis.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_WrapperAPI.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_X86.so -load $(HPVM_LIB_DIR)/LLVMFuseHPVMTensorNodes.so -load $(HPVM_LIB_DIR)/LLVMClearDFG.so -inplace -hpvm-fuse -dfg2llvm-wrapperapi -quantization-levels-filename=$(WRAPPER_API_QUANT_FILE_PATH) -configuration-inputs-filename=$(CONF_FILE_PATH) -dfg2llvm-x86 -clearDFG +TEMP_CONF_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/predictive/temp.txt + + +VISC_PRED_OPTFLAGS3 = -load $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LLVMInPlaceDFGAnalysis.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_WrapperAPI.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_X86.so -load $(HPVM_LIB_DIR)/LLVMFuseHPVMTensorNodes.so -load $(HPVM_LIB_DIR)/LLVMClearDFG.so -inplace -hpvm-fuse -dfg2llvm-wrapperapi -quantization-levels-filename=$(WRAPPER_API_QUANT_FILE_PATH) -configuration-inputs-filename=$(TEMP_CONF_FILE_PATH) -dfg2llvm-x86 -clearDFG + TARGET = $(BUILD_DIR)/$(APP).opt.bc SOURCES = $(SRC_DIR)/$(APP).cpp VISC_RT_PATH = $(LLVM_SRC_ROOT)/../build/projects/visc-rt/visc-rt.ll @@ -55,27 +59,29 @@ $(BUILD_DIR)/%.ll: $(SRC_DIR)/%.cpp $(CC) $(CC_FLAGS) -emit-llvm src/$(APP)_promise.cpp -S -o $(BUILD_DIR)/$(APP)_promise.ll $(CC) $(CC_FLAGS) -emit-llvm src/$(APP)_loop.cpp -S -o $(BUILD_DIR)/$(APP)_loop.ll + $(BUILD_DIR)/%.opt.bc: $(BUILD_DIR)/%.ll $(OPT) -load LLVMGenVISC.so -genvisc -globaldce $(BUILD_DIR)/$(APP).ll -S -o $(BUILD_DIR)/$(APP).visc.ll $(OPT) -load LLVMGenVISC.so -genvisc -globaldce $(BUILD_DIR)/$(APP)_promise.ll -S -o $(BUILD_DIR)/$(APP)_promise.visc.ll $(OPT) -load LLVMGenVISC.so -genvisc -globaldce $(BUILD_DIR)/$(APP)_loop.ll -S -o $(BUILD_DIR)/$(APP)_loop.visc.ll $(OPT) $(VISC_OPTFLAGS) $(BUILD_DIR)/$(APP).visc.ll -o $(BUILD_DIR)/$(APP)_cudnn.bc #$(OPT) $(VISC_OPTFLAGS2) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_promise.bc - $(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_wrapperapi.bc $(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_loop.visc.ll -o $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc - + $(OPT) $(VISC_PRED_OPTFLAGS3) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_pred_wrapperapi.bc + $(OPT) $(VISC_PRED_OPTFLAGS3) $(BUILD_DIR)/$(APP)_loop.visc.ll -o $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi.bc $(LLVM_LINK) $(BUILD_DIR)/$(APP)_cudnn.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_cudnn_linked.bc #$(LLVM_LINK) $(BUILD_DIR)/$(APP)_promise.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_promise_linked.bc - $(LLVM_LINK) $(BUILD_DIR)/$(APP)_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(LLVM_LINK) $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc + $(LLVM_LINK) $(BUILD_DIR)/$(APP)_pred_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_pred_wrapperapi_linked.bc + $(LLVM_LINK) $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi_linked.bc $(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_cudnn_linked $(LINKER_FLAGS) #$(CC) $(BUILD_DIR)/$(APP)_promise_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_promise_linked $(LINKER_FLAGS) - $(CC) $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked $(LINKER_FLAGS) $(CC) $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked $(LINKER_FLAGS) - + $(CC) $(BUILD_DIR)/$(APP)_pred_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_pred_wrapperapi_linked $(LINKER_FLAGS) + $(CC) $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi_linked $(LINKER_FLAGS) $(BUILD_DIR): mkdir -p $@ diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet_imagenet/predictive/alexnet_imagenet.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet_imagenet/predictive/alexnet_imagenet.txt new file mode 100644 index 0000000000000000000000000000000000000000..b0e42a5aaa5d7b5a06b6422a5c33a0047b6eff8d --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet_imagenet/predictive/alexnet_imagenet.txt @@ -0,0 +1,229 @@ +2739.950736 ++++++ +conf1 1 1 56.3 0.0 +1 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +2 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +3 gpu conv fp32 11 add fp32 1 relu fp32 1 +4 gpu conv fp32 11 add fp32 1 relu fp32 1 +5 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +6 gpu mul fp32 11 add fp32 1 relu fp32 1 +7 gpu mul fp32 11 add fp32 1 relu fp32 1 +8 gpu mul fp32 11 add fp32 1 +9 gpu softmax fp32 1 +----- ++++++ +conf2 1.802133644103582 1.8186433204507424 55.76 0.5399999999999991 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 +5 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 relu fp16 1 +7 gpu mul fp16 12 add fp16 1 relu fp16 1 +8 gpu mul fp16 12 add fp16 1 +9 gpu softmax fp32 1 +----- ++++++ +conf3 1.7574572103878898 1.7673706184460103 55.58 0.7199999999999989 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 +5 gpu conv samp_fp16 268 add fp16 1 relu fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 relu fp16 1 +7 gpu mul fp16 12 add fp16 1 relu fp16 1 +8 gpu mul fp16 12 add fp16 1 +9 gpu softmax fp32 1 +----- ++++++ +conf4 2.0227701930718065 2.043112495268932 55.42 0.8799999999999955 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu conv perf_fp16 166 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 +5 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 relu fp16 1 +7 gpu mul fp16 12 add fp16 1 relu fp16 1 +8 gpu mul fp16 12 add fp16 1 +9 gpu softmax fp32 1 +----- ++++++ +conf5 1.9872634777043927 2.002789650227035 55.120000000000005 1.1799999999999926 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu conv perf_fp16 164 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 +5 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 relu fp16 1 +7 gpu mul fp16 12 add fp16 1 relu fp16 1 +8 gpu mul fp16 12 add fp16 1 +9 gpu softmax fp32 1 +----- ++++++ +conf6 1.8204253918445088 1.843736069756362 54.84 1.4599999999999937 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 164 add fp16 1 relu fp16 1 +5 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 relu fp16 1 +7 gpu mul fp16 12 add fp16 1 relu fp16 1 +8 gpu mul fp16 12 add fp16 1 +9 gpu softmax fp32 1 +----- ++++++ +conf7 1.9308336510645352 1.934889049414224 54.74 1.5599999999999952 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu conv perf_fp16 168 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 163 add fp16 1 relu fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 relu fp16 1 +7 gpu mul fp16 12 add fp16 1 relu fp16 1 +8 gpu mul fp16 12 add fp16 1 +9 gpu softmax fp32 1 +----- ++++++ +conf8 2.0146435217865446 2.0367475358800102 54.58 1.7199999999999989 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu conv perf_fp16 162 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 +5 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 relu fp16 1 +7 gpu mul fp16 12 add fp16 1 relu fp16 1 +8 gpu mul fp16 12 add fp16 1 +9 gpu softmax fp32 1 +----- ++++++ +conf9 2.0101709494490696 2.0329911158023064 54.400000000000006 1.8999999999999915 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu conv perf_fp16 164 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 163 add fp16 1 relu fp16 1 +5 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 relu fp16 1 +7 gpu mul fp16 12 add fp16 1 relu fp16 1 +8 gpu mul fp16 12 add fp16 1 +9 gpu softmax fp32 1 +----- ++++++ +conf10 2.0052132441967916 2.0284931705407003 54.300000000000004 1.999999999999993 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu conv perf_fp16 168 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 163 add fp16 1 relu fp16 1 +5 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 relu fp16 1 +7 gpu mul fp16 12 add fp16 1 relu fp16 1 +8 gpu mul fp16 12 add fp16 1 +9 gpu softmax fp32 1 +----- ++++++ +conf11 2.010827434817262 2.036001862538864 54.2 2.0999999999999943 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu conv perf_fp16 164 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 163 add fp16 1 relu fp16 1 +5 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 relu fp16 1 +7 gpu mul fp16 12 add fp16 1 relu fp16 1 +8 gpu mul fp16 12 add fp16 1 +9 gpu softmax fp32 1 +----- ++++++ +conf12 2.019868378233057 2.0433540129730265 54.17999999999999 2.1200000000000045 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu conv perf_fp16 162 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 163 add fp16 1 relu fp16 1 +5 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 relu fp16 1 +7 gpu mul fp16 12 add fp16 1 relu fp16 1 +8 gpu mul fp16 12 add fp16 1 +9 gpu softmax fp32 1 +----- ++++++ +conf13 1.9923471030291253 2.009177323959059 54.120000000000005 2.1799999999999926 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu conv perf_fp16 164 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 163 add fp16 1 relu fp16 1 +5 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 relu fp16 1 +7 gpu mul fp16 12 add fp16 1 relu fp16 1 +8 gpu mul fp16 12 add fp16 1 +9 gpu softmax fp32 1 +----- ++++++ +conf14 1.9923471030291253 2.009177323959059 54.120000000000005 2.1799999999999926 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu conv perf_fp16 164 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 163 add fp16 1 relu fp16 1 +5 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 relu fp16 1 +7 gpu mul fp16 12 add fp16 1 relu fp16 1 +8 gpu mul fp16 12 add fp16 1 +9 gpu softmax fp32 1 +----- ++++++ +conf15 2.028037341700216 2.049760395549724 54.0 2.299999999999997 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu conv perf_fp16 166 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 163 add fp16 1 relu fp16 1 +5 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 relu fp16 1 +7 gpu mul fp16 12 add fp16 1 relu fp16 1 +8 gpu mul fp16 12 add fp16 1 +9 gpu softmax fp32 1 +----- ++++++ +conf16 1.9910730364852436 2.006510848093771 53.54 2.759999999999998 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu conv perf_fp16 164 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 164 add fp16 1 relu fp16 1 +5 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 relu fp16 1 +7 gpu mul fp16 12 add fp16 1 relu fp16 1 +8 gpu mul fp16 12 add fp16 1 +9 gpu softmax fp32 1 +----- ++++++ +conf17 2.1567475543719614 2.159142310265706 53.300000000000004 2.999999999999993 +1 gpu conv perf_fp16 164 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu conv perf_fp16 166 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 163 add fp16 1 relu fp16 1 +5 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 relu fp16 1 +7 gpu mul fp16 12 add fp16 1 relu fp16 1 +8 gpu mul fp16 12 add fp16 1 +9 gpu softmax fp32 1 +----- ++++++ +conf18 2.1567475543719614 2.159142310265706 53.300000000000004 2.999999999999993 +1 gpu conv perf_fp16 164 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu conv perf_fp16 166 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 163 add fp16 1 relu fp16 1 +5 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 relu fp16 1 +7 gpu mul fp16 12 add fp16 1 relu fp16 1 +8 gpu mul fp16 12 add fp16 1 +9 gpu softmax fp32 1 +----- ++++++ +conf19 2.0232690820426464 2.0527698121318476 53.300000000000004 2.999999999999993 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu conv perf_fp16 168 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 11 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 163 add fp16 1 relu fp16 1 +5 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +6 gpu mul fp16 12 add fp16 1 relu fp16 1 +7 gpu mul fp16 12 add fp16 1 relu fp16 1 +8 gpu mul fp16 12 add fp16 1 +9 gpu softmax fp32 1 +----- diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_mnist/Makefile b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_mnist/Makefile index 7f7b84d7e26475a6ae1cfc0bcae7d3908f3bd881..909ecfdca5a730e4d400cce70addee2d36573f1c 100644 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_mnist/Makefile +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_mnist/Makefile @@ -1,4 +1,5 @@ DNN_BENCHMARK_ROOT = $(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks +# NOTE: can configure build directory #HPVM_BUILD_DIR = $(LLVM_SRC_ROOT)/../build_hpvm/ HPVM_BUILD_DIR = $(LLVM_BUILD_ROOT) @@ -28,18 +29,21 @@ HPVM_LIB_DIR = $(HPVM_BUILD_DIR)/lib VISC_OPTFLAGS = -load $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LLVMInPlaceDFGAnalysis.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_CUDNN.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_X86.so -load $(HPVM_LIB_DIR)/LLVMClearDFG.so -inplace -dfg2llvm-cudnn -dfg2llvm-x86 -clearDFG -#$(APP) -PROMISE_QUANT_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/lenet_mnist/data/quant_ranges.txt + +PROMISE_QUANT_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/data/quant_ranges.txt VISC_OPTFLAGS2 = -load $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LLVMInPlaceDFGAnalysis.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_PROMISE.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_CUDNN.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_X86.so -load $(HPVM_LIB_DIR)/LLVMFuseHPVMTensorNodes.so -load $(HPVM_LIB_DIR)/LLVMClearDFG.so -inplace -hpvm-fuse -dfg2llvm-promise -quantization-levels-filename=$(PROMISE_QUANT_FILE_PATH) -dfg2llvm-cudnn -dfg2llvm-x86 -clearDFG WRAPPER_API_QUANT_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/data/quant_ranges_rt.txt +CONF_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/data/tuner_confs.txt + +VISC_OPTFLAGS3 = -load $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LLVMInPlaceDFGAnalysis.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_WrapperAPI.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_X86.so -load $(HPVM_LIB_DIR)/LLVMFuseHPVMTensorNodes.so -load $(HPVM_LIB_DIR)/LLVMClearDFG.so -inplace -hpvm-fuse -dfg2llvm-wrapperapi -quantization-levels-filename=$(WRAPPER_API_QUANT_FILE_PATH) -configuration-inputs-filename=$(CONF_FILE_PATH) -dfg2llvm-x86 -clearDFG -CONF_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/data/tuner_confs_base.txt +TEMP_CONF_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/predictive/temp.txt -VISC_OPTFLAGS3 = -load $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LLVMInPlaceDFGAnalysis.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_WrapperAPI.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_X86.so -load $(HPVM_LIB_DIR)/LLVMFuseHPVMTensorNodes.so -load $(HPVM_LIB_DIR)/LLVMClearDFG.so -inplace -hpvm-fuse -dfg2llvm-wrapperapi -quantization-levels-filename=$(WRAPPER_API_QUANT_FILE_PATH) -configuration-inputs-filename=$(CONF_FILE_PATH) -dfg2llvm-x86 -clearDFG +VISC_PRED_OPTFLAGS3 = -load $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LLVMInPlaceDFGAnalysis.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_WrapperAPI.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_X86.so -load $(HPVM_LIB_DIR)/LLVMFuseHPVMTensorNodes.so -load $(HPVM_LIB_DIR)/LLVMClearDFG.so -inplace -hpvm-fuse -dfg2llvm-wrapperapi -quantization-levels-filename=$(WRAPPER_API_QUANT_FILE_PATH) -configuration-inputs-filename=$(TEMP_CONF_FILE_PATH) -dfg2llvm-x86 -clearDFG TARGET = $(BUILD_DIR)/$(APP).opt.bc SOURCES = $(SRC_DIR)/$(APP).cpp @@ -52,26 +56,32 @@ default: $(BUILD_DIR) $(TARGET) $(BUILD_DIR)/%.ll: $(SRC_DIR)/%.cpp $(CC) $(CC_FLAGS) -emit-llvm src/$(APP).cpp -S -o $(BUILD_DIR)/$(APP).ll - #$(CC) $(CC_FLAGS) -emit-llvm src/$(APP)_promise.cpp -S -o $(BUILD_DIR)/$(APP)_promise.ll + $(CC) $(CC_FLAGS) -emit-llvm src/$(APP)_promise.cpp -S -o $(BUILD_DIR)/$(APP)_promise.ll $(CC) $(CC_FLAGS) -emit-llvm src/$(APP)_loop.cpp -S -o $(BUILD_DIR)/$(APP)_loop.ll $(BUILD_DIR)/%.opt.bc: $(BUILD_DIR)/%.ll $(OPT) -load LLVMGenVISC.so -genvisc -globaldce $(BUILD_DIR)/$(APP).ll -S -o $(BUILD_DIR)/$(APP).visc.ll - #$(OPT) -load LLVMGenVISC.so -genvisc -globaldce $(BUILD_DIR)/$(APP)_promise.ll -S -o $(BUILD_DIR)/$(APP)_promise.visc.ll + $(OPT) -load LLVMGenVISC.so -genvisc -globaldce $(BUILD_DIR)/$(APP)_promise.ll -S -o $(BUILD_DIR)/$(APP)_promise.visc.ll $(OPT) -load LLVMGenVISC.so -genvisc -globaldce $(BUILD_DIR)/$(APP)_loop.ll -S -o $(BUILD_DIR)/$(APP)_loop.visc.ll $(OPT) $(VISC_OPTFLAGS) $(BUILD_DIR)/$(APP).visc.ll -o $(BUILD_DIR)/$(APP)_cudnn.bc #$(OPT) $(VISC_OPTFLAGS2) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_promise.bc - #$(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_wrapperapi.bc + $(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_wrapperapi.bc $(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_loop.visc.ll -o $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc + $(OPT) $(VISC_PRED_OPTFLAGS3) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_pred_wrapperapi.bc + $(OPT) $(VISC_PRED_OPTFLAGS3) $(BUILD_DIR)/$(APP)_loop.visc.ll -o $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi.bc $(LLVM_LINK) $(BUILD_DIR)/$(APP)_cudnn.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_cudnn_linked.bc #$(LLVM_LINK) $(BUILD_DIR)/$(APP)_promise.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_promise_linked.bc - #$(LLVM_LINK) $(BUILD_DIR)/$(APP)_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc + $(LLVM_LINK) $(BUILD_DIR)/$(APP)_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(LLVM_LINK) $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc + $(LLVM_LINK) $(BUILD_DIR)/$(APP)_pred_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_pred_wrapperapi_linked.bc + $(LLVM_LINK) $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi_linked.bc $(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_cudnn_linked $(LINKER_FLAGS) #$(CC) $(BUILD_DIR)/$(APP)_promise_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_promise_linked $(LINKER_FLAGS) - #$(CC) $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked $(LINKER_FLAGS) + $(CC) $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked $(LINKER_FLAGS) $(CC) $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked $(LINKER_FLAGS) + $(CC) $(BUILD_DIR)/$(APP)_pred_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_pred_wrapperapi_linked $(LINKER_FLAGS) + $(CC) $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi_linked $(LINKER_FLAGS) $(BUILD_DIR): mkdir -p $@ diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_mnist/predictive/lenet_mnist.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_mnist/predictive/lenet_mnist.txt new file mode 100644 index 0000000000000000000000000000000000000000..b4e51dff426f4d3c5cb7b9572e6aa5940212acbd --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/lenet_mnist/predictive/lenet_mnist.txt @@ -0,0 +1,409 @@ +282.5141369999999 ++++++ +conf1 1 1 98.7 0.0 +1 gpu conv fp32 11 add fp32 1 pool_max fp32 1 tanh fp32 1 +2 gpu conv fp32 11 add fp32 1 pool_max fp32 1 tanh fp32 1 +3 gpu mul fp32 11 add fp32 1 tanh fp32 1 +4 gpu mul fp32 11 add fp32 1 tanh fp32 1 +5 gpu softmax fp32 1 +----- ++++++ +conf2 1.828613181003043 2.071721708828981 98.65 0.04999999999999716 +1 gpu conv perf_fp16 156 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf3 1.8936889628815377 2.139779619692146 98.65 0.04999999999999716 +1 gpu conv perf_fp16 152 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf4 1.8936889628815377 2.139779619692146 98.65 0.04999999999999716 +1 gpu conv perf_fp16 152 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf5 1.8936889628815377 2.139779619692146 98.65 0.04999999999999716 +1 gpu conv perf_fp16 152 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf6 1.8247639611533713 2.0227145446958756 98.64 0.060000000000002274 +1 gpu conv fp16 11 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf7 1.8247639611533713 2.0227145446958756 98.64 0.060000000000002274 +1 gpu conv fp16 11 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf8 1.8406161850501603 2.037849502542524 98.64 0.060000000000002274 +1 gpu conv fp16 11 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf9 1.8406161850501603 2.037849502542524 98.64 0.060000000000002274 +1 gpu conv fp16 11 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf10 1.8406161850501603 2.037849502542524 98.64 0.060000000000002274 +1 gpu conv fp16 11 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf11 1.8663357888260776 2.115790921611576 98.64 0.060000000000002274 +1 gpu conv perf_fp16 155 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf12 1.8663357888260776 2.115790921611576 98.64 0.060000000000002274 +1 gpu conv perf_fp16 155 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf13 1.8663357888260776 2.115790921611576 98.64 0.060000000000002274 +1 gpu conv perf_fp16 155 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf14 1.8645645142051612 2.1037012333044935 98.61999999999999 0.0800000000000125 +1 gpu conv perf_fp16 167 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf15 1.8645645142051612 2.1037012333044935 98.61999999999999 0.0800000000000125 +1 gpu conv perf_fp16 167 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf16 1.8645645142051612 2.1037012333044935 98.61999999999999 0.0800000000000125 +1 gpu conv perf_fp16 167 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf17 2.2168527051833635 2.453341076720038 98.61999999999999 0.0800000000000125 +1 gpu conv samp_fp16 264 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf18 2.2168527051833635 2.453341076720038 98.61999999999999 0.0800000000000125 +1 gpu conv samp_fp16 264 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf19 2.2168527051833635 2.453341076720038 98.61999999999999 0.0800000000000125 +1 gpu conv samp_fp16 264 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf20 1.8406161850501603 2.037849502542524 98.6 0.10000000000000853 +1 gpu conv fp16 12 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf21 1.8406161850501603 2.037849502542524 98.6 0.10000000000000853 +1 gpu conv fp16 12 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf22 1.8406161850501603 2.037849502542524 98.6 0.10000000000000853 +1 gpu conv fp16 12 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf23 1.8406161850501603 2.037849502542524 98.6 0.10000000000000853 +1 gpu conv fp16 11 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf24 1.8406161850501603 2.037849502542524 98.6 0.10000000000000853 +1 gpu conv fp16 11 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf25 1.8406161850501603 2.037849502542524 98.6 0.10000000000000853 +1 gpu conv fp16 11 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf26 2.200653361151419 2.425091789360736 98.6 0.10000000000000853 +1 gpu conv samp_fp16 266 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf27 2.200653361151419 2.425091789360736 98.6 0.10000000000000853 +1 gpu conv samp_fp16 266 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf28 1.8406161850501603 2.037849502542524 98.58 0.12000000000000455 +1 gpu conv fp16 11 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf29 1.8406161850501603 2.037849502542524 98.58 0.12000000000000455 +1 gpu conv fp16 11 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf30 1.8406161850501603 2.037849502542524 98.58 0.12000000000000455 +1 gpu conv fp16 11 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf31 1.8445326456180258 2.087601822059355 98.58 0.12000000000000455 +1 gpu conv perf_fp16 156 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf32 1.8445326456180258 2.087601822059355 98.58 0.12000000000000455 +1 gpu conv perf_fp16 156 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf33 1.8445326456180258 2.087601822059355 98.58 0.12000000000000455 +1 gpu conv perf_fp16 156 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf34 1.8916677984300285 2.155437579874673 98.58 0.12000000000000455 +1 gpu conv perf_fp16 158 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf35 1.8916677984300285 2.155437579874673 98.58 0.12000000000000455 +1 gpu conv perf_fp16 158 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf36 1.8916677984300285 2.155437579874673 98.58 0.12000000000000455 +1 gpu conv perf_fp16 158 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf37 1.8649226857257986 2.1076025277601325 98.56 0.14000000000000057 +1 gpu conv perf_fp16 168 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf38 1.8649226857257986 2.1076025277601325 98.56 0.14000000000000057 +1 gpu conv perf_fp16 168 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf39 1.8649226857257986 2.1076025277601325 98.56 0.14000000000000057 +1 gpu conv perf_fp16 168 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf40 1.8463058650555446 2.067271423078985 98.56 0.14000000000000057 +1 gpu conv perf_fp16 157 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf41 1.8463058650555446 2.067271423078985 98.56 0.14000000000000057 +1 gpu conv perf_fp16 157 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf42 1.8463058650555446 2.067271423078985 98.56 0.14000000000000057 +1 gpu conv perf_fp16 157 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf43 1.9234076467497994 2.1864740913112275 98.56 0.14000000000000057 +1 gpu conv perf_fp16 153 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf44 1.9234076467497994 2.1864740913112275 98.56 0.14000000000000057 +1 gpu conv perf_fp16 153 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf45 1.9234076467497994 2.1864740913112275 98.56 0.14000000000000057 +1 gpu conv perf_fp16 153 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf46 1.8698191484268973 2.13979218727595 98.54 0.1599999999999966 +1 gpu conv perf_fp16 159 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf47 1.8698191484268973 2.13979218727595 98.54 0.1599999999999966 +1 gpu conv perf_fp16 159 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf48 1.8575043605938137 2.092057786757256 98.52 0.18000000000000682 +1 gpu conv perf_fp16 165 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf49 1.8575043605938137 2.092057786757256 98.52 0.18000000000000682 +1 gpu conv perf_fp16 165 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf50 1.8575043605938137 2.092057786757256 98.52 0.18000000000000682 +1 gpu conv perf_fp16 165 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- ++++++ +conf51 1.8534621507951072 2.1231113105788597 98.44000000000001 0.2599999999999909 +1 gpu conv perf_fp16 159 add fp16 1 pool_max fp16 1 tanh fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 pool_max fp16 1 tanh fp16 1 +3 gpu mul fp16 12 add fp16 1 tanh fp16 1 +4 gpu mul fp16 12 add fp16 1 tanh fp16 1 +5 gpu softmax fp32 1 +----- diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/Makefile b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/Makefile index 3eaa8990951ef4cf2db0710cb4f7913492459849..3d678aae8feaf65bdfb9f3c04fafcb6a04505070 100644 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/Makefile +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/Makefile @@ -22,7 +22,7 @@ TENSOR_AUTOTUNER_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/lib/libtensor_au CC_FLAGS = -I $(LLVM_INCLUDE_DIR) -I $(TENSOR_INCLUDE_DIR) -I $(TENSOR_RT_INCLUDE_DIR) -I $(CUDA_INCLUDE_PATH) -fno-exceptions -ffast-math -std=c++11 -O3 CCFLAGS += -DDEVICE=CUDNN_TARGET -LINKER_FLAGS = -lpthread -lcudart -lcurand -lcudnn -lcublas -lcufft -lOpenCL -lstdc++fs -lomp +LINKER_FLAGS = -lpthread -lcudart -lcurand -lcudnn -lcublas -lcufft -lOpenCL -lstdc++fs -lomp HPVM_LIB_DIR = $(HPVM_BUILD_DIR)/lib @@ -35,11 +35,16 @@ PROMISE_QUANT_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(A VISC_OPTFLAGS2 = -load $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LLVMInPlaceDFGAnalysis.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_PROMISE.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_CUDNN.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_X86.so -load $(HPVM_LIB_DIR)/LLVMFuseHPVMTensorNodes.so -load $(HPVM_LIB_DIR)/LLVMClearDFG.so -inplace -hpvm-fuse -dfg2llvm-promise -quantization-levels-filename=$(PROMISE_QUANT_FILE_PATH) -dfg2llvm-cudnn -dfg2llvm-x86 -clearDFG WRAPPER_API_QUANT_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/data/quant_ranges_rt.txt -CONF_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/data/tuner_confs_base.txt +CONF_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/data/tuner_confs.txt VISC_OPTFLAGS3 = -load $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LLVMInPlaceDFGAnalysis.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_WrapperAPI.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_X86.so -load $(HPVM_LIB_DIR)/LLVMFuseHPVMTensorNodes.so -load $(HPVM_LIB_DIR)/LLVMClearDFG.so -inplace -hpvm-fuse -dfg2llvm-wrapperapi -quantization-levels-filename=$(WRAPPER_API_QUANT_FILE_PATH) -configuration-inputs-filename=$(CONF_FILE_PATH) -dfg2llvm-x86 -clearDFG +TEMP_CONF_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/predictive/temp.txt + + +VISC_PRED_OPTFLAGS3 = -load $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LLVMInPlaceDFGAnalysis.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_WrapperAPI.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_X86.so -load $(HPVM_LIB_DIR)/LLVMFuseHPVMTensorNodes.so -load $(HPVM_LIB_DIR)/LLVMClearDFG.so -inplace -hpvm-fuse -dfg2llvm-wrapperapi -quantization-levels-filename=$(WRAPPER_API_QUANT_FILE_PATH) -configuration-inputs-filename=$(TEMP_CONF_FILE_PATH) -dfg2llvm-x86 -clearDFG + TARGET = $(BUILD_DIR)/$(APP).opt.bc SOURCES = $(SRC_DIR)/$(APP).cpp VISC_RT_PATH = $(LLVM_SRC_ROOT)/../build/projects/visc-rt/visc-rt.ll @@ -63,14 +68,20 @@ $(BUILD_DIR)/%.opt.bc: $(BUILD_DIR)/%.ll #$(OPT) $(VISC_OPTFLAGS2) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_promise.bc $(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_wrapperapi.bc $(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_loop.visc.ll -o $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc + $(OPT) $(VISC_PRED_OPTFLAGS3) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_pred_wrapperapi.bc + $(OPT) $(VISC_PRED_OPTFLAGS3) $(BUILD_DIR)/$(APP)_loop.visc.ll -o $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi.bc $(LLVM_LINK) $(BUILD_DIR)/$(APP)_cudnn.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_cudnn_linked.bc #$(LLVM_LINK) $(BUILD_DIR)/$(APP)_promise.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_promise_linked.bc $(LLVM_LINK) $(BUILD_DIR)/$(APP)_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(LLVM_LINK) $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc + $(LLVM_LINK) $(BUILD_DIR)/$(APP)_pred_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_pred_wrapperapi_linked.bc + $(LLVM_LINK) $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi_linked.bc $(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_cudnn_linked $(LINKER_FLAGS) #$(CC) $(BUILD_DIR)/$(APP)_promise_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_promise_linked $(LINKER_FLAGS) $(CC) $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked $(LINKER_FLAGS) $(CC) $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked $(LINKER_FLAGS) + $(CC) $(BUILD_DIR)/$(APP)_pred_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_pred_wrapperapi_linked $(LINKER_FLAGS) + $(CC) $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi_linked $(LINKER_FLAGS) $(BUILD_DIR): mkdir -p $@ diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/predictive/mobilenet.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/predictive/mobilenet.txt new file mode 100644 index 0000000000000000000000000000000000000000..b4d8bd893c8d9395fce6a3484d75f543f1e72da2 --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/mobilenet/predictive/mobilenet.txt @@ -0,0 +1,3220 @@ +4077.307063200001 ++++++ +conf1 1 1 84.42 0.0 +1 gpu conv fp32 11 +2 gpu batchnorm fp32 11 +3 gpu relu fp32 11 +4 gpu group_conv fp32 11 +5 gpu batchnorm fp32 11 +6 gpu relu fp32 11 +7 gpu conv fp32 11 +8 gpu batchnorm fp32 11 +9 gpu relu fp32 11 +10 gpu group_conv fp32 11 +11 gpu batchnorm fp32 11 +12 gpu relu fp32 11 +13 gpu conv fp32 11 +14 gpu batchnorm fp32 11 +15 gpu relu fp32 11 +16 gpu group_conv fp32 11 +17 gpu batchnorm fp32 11 +18 gpu relu fp32 11 +19 gpu conv fp32 11 +20 gpu batchnorm fp32 11 +21 gpu relu fp32 11 +22 gpu group_conv fp32 11 +23 gpu batchnorm fp32 11 +24 gpu relu fp32 11 +25 gpu conv fp32 11 +26 gpu batchnorm fp32 11 +27 gpu relu fp32 11 +28 gpu group_conv fp32 11 +29 gpu batchnorm fp32 11 +30 gpu relu fp32 11 +31 gpu conv fp32 11 +32 gpu batchnorm fp32 11 +33 gpu relu fp32 11 +34 gpu group_conv fp32 11 +35 gpu batchnorm fp32 11 +36 gpu relu fp32 11 +37 gpu conv fp32 11 +38 gpu batchnorm fp32 11 +39 gpu relu fp32 11 +40 gpu group_conv fp32 11 +41 gpu batchnorm fp32 11 +42 gpu relu fp32 11 +43 gpu conv fp32 11 +44 gpu batchnorm fp32 11 +45 gpu relu fp32 11 +46 gpu group_conv fp32 11 +47 gpu batchnorm fp32 11 +48 gpu relu fp32 11 +49 gpu conv fp32 11 +50 gpu batchnorm fp32 11 +51 gpu relu fp32 11 +52 gpu group_conv fp32 11 +53 gpu batchnorm fp32 11 +54 gpu relu fp32 11 +55 gpu conv fp32 11 +56 gpu batchnorm fp32 11 +57 gpu relu fp32 11 +58 gpu group_conv fp32 11 +59 gpu batchnorm fp32 11 +60 gpu relu fp32 11 +61 gpu conv fp32 11 +62 gpu batchnorm fp32 11 +63 gpu relu fp32 11 +64 gpu group_conv fp32 11 +65 gpu batchnorm fp32 11 +66 gpu relu fp32 11 +67 gpu conv fp32 11 +68 gpu batchnorm fp32 11 +69 gpu relu fp32 11 +70 gpu group_conv fp32 11 +71 gpu batchnorm fp32 11 +72 gpu relu fp32 11 +73 gpu conv fp32 11 +74 gpu batchnorm fp32 11 +75 gpu relu fp32 11 +76 gpu group_conv fp32 11 +77 gpu batchnorm fp32 11 +78 gpu relu fp32 11 +79 gpu conv fp32 11 +80 gpu batchnorm fp32 11 +81 gpu relu fp32 11 +82 gpu pool_mean fp32 11 +83 gpu mul fp32 11 add fp32 1 +84 gpu softmax fp32 1 +----- ++++++ +conf2 1.4930855091460031 1.447990050940341 83.72 0.7000000000000028 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 163 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 151 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv fp16 12 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf3 1.493397883226807 1.449591062426989 83.72 0.7000000000000028 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 163 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 151 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 163 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf4 1.4934429016801338 1.4500582352111675 83.72 0.7000000000000028 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 163 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 151 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 168 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf5 1.4938214813031556 1.450038222978811 83.72 0.7000000000000028 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 163 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 151 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 157 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf6 1.4933879828131855 1.449975636202813 83.72 0.7000000000000028 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 163 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 151 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 160 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf7 1.492663093331302 1.4487067754520524 83.7 0.7199999999999989 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 12 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 151 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 167 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf8 1.495724395088184 1.4507925552157772 83.56 0.8599999999999994 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 12 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 151 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 162 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf9 1.496506307637598 1.4521705950285135 83.36 1.0600000000000023 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 163 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 151 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 162 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf10 1.496532672928805 1.4521696542076958 83.36 1.0600000000000023 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 163 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 151 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 156 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf11 1.4988418058849937 1.4555327556053628 83.28 1.1400000000000006 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 168 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 12 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 164 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 158 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf12 1.4994289979945077 1.4562439330251535 83.28 1.1400000000000006 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 168 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 12 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 152 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 153 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf13 1.4952028793065038 1.450369851058777 83.14 1.2800000000000011 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 162 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv perf_fp16 161 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 151 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 155 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 156 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 152 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 152 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf14 1.4933978285280285 1.448265686258097 83.12 1.2999999999999972 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 163 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 158 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf15 1.491958833559989 1.4459262032919467 83.08 1.3400000000000034 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 12 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv perf_fp16 161 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 157 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 155 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 152 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 152 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf16 1.4937317297990984 1.4498121856525021 83.02000000000001 1.3999999999999915 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 163 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 156 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 158 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf17 1.4963413808686974 1.4522391736954623 82.86 1.5600000000000023 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 163 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 151 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 165 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf18 1.4942172827099065 1.4504631324933321 82.86 1.5600000000000023 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 163 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 157 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 158 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf19 1.4963964073376739 1.4525461321361477 82.86 1.5600000000000023 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 163 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 151 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 158 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf20 1.4932583049858652 1.4472547227714012 82.84 1.5799999999999983 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 163 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 151 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv samp_fp16 266 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf21 1.4964326545281064 1.4526263046333605 82.82000000000001 1.5999999999999943 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 163 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 152 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 158 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf22 1.4966042483929347 1.4527859961226985 82.82000000000001 1.5999999999999943 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 163 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 152 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 153 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf23 1.4966008974318024 1.4527415844509437 82.78 1.6400000000000006 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 163 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 155 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 158 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf24 1.4932738366973777 1.448820445466833 82.64 1.7800000000000011 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 164 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv perf_fp16 161 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 151 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 155 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 156 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 157 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 152 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf25 1.4940402684133964 1.447332235394843 82.48 1.9399999999999977 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 12 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 151 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv samp_fp16 261 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf26 1.4981764588414919 1.4530714150549078 82.39999999999999 2.0200000000000102 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 152 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv perf_fp16 161 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 151 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 161 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 156 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 152 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 152 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf27 1.5004334658773033 1.4549115105608688 82.3 2.1200000000000045 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 152 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv perf_fp16 161 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 151 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 155 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 156 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 152 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 156 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf28 1.5006808163336343 1.4553824345285296 82.3 2.1200000000000045 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 152 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv perf_fp16 161 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 151 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 155 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 156 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 152 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 152 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf29 1.4999870719460484 1.4571625511374704 82.28 2.1400000000000006 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 168 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 163 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 152 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 165 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf30 1.500042366879961 1.4574715946270216 82.28 2.1400000000000006 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 168 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 163 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 152 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 158 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf31 1.500214789632402 1.4576323532660131 82.28 2.1400000000000006 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 168 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 163 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 164 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 152 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 151 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 153 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf32 1.4927009086066445 1.4484049211953174 82.26 2.1599999999999966 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 164 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv perf_fp16 161 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 151 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 161 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 156 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 152 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 152 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf33 1.5003438014588875 1.4538240352408085 82.22 2.200000000000003 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 152 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv perf_fp16 161 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 151 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 155 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 152 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 152 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf34 1.5041587978616728 1.4610492456195174 82.02000000000001 2.3999999999999915 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 168 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 152 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv perf_fp16 161 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 161 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 155 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 152 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 152 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 158 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf35 1.5000040131742656 1.4555601139156464 81.88 2.5400000000000063 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv fp16 12 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv perf_fp16 152 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 12 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv perf_fp16 161 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 151 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 155 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 151 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 152 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 167 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf36 1.4950571524902583 1.451478376045808 81.84 2.5799999999999983 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 164 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 12 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv perf_fp16 161 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 161 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 155 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 155 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 152 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 153 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- ++++++ +conf37 1.4975271575548847 1.4532126224638244 81.44 2.980000000000004 +1 gpu conv fp16 12 +2 gpu batchnorm fp16 12 +3 gpu relu fp16 12 +4 gpu group_conv fp16 12 +5 gpu batchnorm fp16 12 +6 gpu relu fp16 12 +7 gpu conv fp16 12 +8 gpu batchnorm fp16 12 +9 gpu relu fp16 12 +10 gpu group_conv fp16 12 +11 gpu batchnorm fp16 12 +12 gpu relu fp16 12 +13 gpu conv fp16 12 +14 gpu batchnorm fp16 12 +15 gpu relu fp16 12 +16 gpu group_conv fp16 12 +17 gpu batchnorm fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 164 +20 gpu batchnorm fp16 12 +21 gpu relu fp16 12 +22 gpu group_conv fp16 12 +23 gpu batchnorm fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu group_conv fp16 12 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 12 +32 gpu batchnorm fp16 12 +33 gpu relu fp16 12 +34 gpu group_conv fp16 12 +35 gpu batchnorm fp16 12 +36 gpu relu fp16 12 +37 gpu conv fp16 11 +38 gpu batchnorm fp16 12 +39 gpu relu fp16 12 +40 gpu group_conv fp16 12 +41 gpu batchnorm fp16 12 +42 gpu relu fp16 12 +43 gpu conv fp16 12 +44 gpu batchnorm fp16 12 +45 gpu relu fp16 12 +46 gpu group_conv fp16 12 +47 gpu batchnorm fp16 12 +48 gpu relu fp16 12 +49 gpu conv perf_fp16 155 +50 gpu batchnorm fp16 12 +51 gpu relu fp16 12 +52 gpu group_conv fp16 12 +53 gpu batchnorm fp16 12 +54 gpu relu fp16 12 +55 gpu conv perf_fp16 155 +56 gpu batchnorm fp16 12 +57 gpu relu fp16 12 +58 gpu group_conv fp16 12 +59 gpu batchnorm fp16 12 +60 gpu relu fp16 12 +61 gpu conv perf_fp16 151 +62 gpu batchnorm fp16 12 +63 gpu relu fp16 12 +64 gpu group_conv fp16 12 +65 gpu batchnorm fp16 12 +66 gpu relu fp16 12 +67 gpu conv perf_fp16 155 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu group_conv fp16 12 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv perf_fp16 152 +74 gpu batchnorm fp16 12 +75 gpu relu fp16 12 +76 gpu group_conv fp16 12 +77 gpu batchnorm fp16 12 +78 gpu relu fp16 12 +79 gpu conv perf_fp16 153 +80 gpu batchnorm fp16 12 +81 gpu relu fp16 12 +82 gpu pool_mean fp16 12 +83 gpu mul fp16 12 add fp16 1 +84 gpu softmax fp32 1 +----- diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/Makefile b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/Makefile index 1b92bacec182ab4efaf59b6d7061732c6367c5ec..4de3565fce81f857beb3dd4ee60e7423bdd0322a 100644 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/Makefile +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/Makefile @@ -22,7 +22,7 @@ TENSOR_AUTOTUNER_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/lib/libtensor_au CC_FLAGS = -I $(LLVM_INCLUDE_DIR) -I $(TENSOR_INCLUDE_DIR) -I $(TENSOR_RT_INCLUDE_DIR) -I $(CUDA_INCLUDE_PATH) -fno-exceptions -ffast-math -std=c++11 -O3 CCFLAGS += -DDEVICE=CUDNN_TARGET -LINKER_FLAGS = -lpthread -lcudart -lcurand -lcudnn -lcublas -lcufft -lOpenCL -lstdc++fs -lomp +LINKER_FLAGS = -lpthread -lcudart -lcurand -lcudnn -lcublas -lcufft -lOpenCL -lstdc++fs -lomp HPVM_LIB_DIR = $(HPVM_BUILD_DIR)/lib @@ -35,11 +35,16 @@ PROMISE_QUANT_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(A VISC_OPTFLAGS2 = -load $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LLVMInPlaceDFGAnalysis.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_PROMISE.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_CUDNN.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_X86.so -load $(HPVM_LIB_DIR)/LLVMFuseHPVMTensorNodes.so -load $(HPVM_LIB_DIR)/LLVMClearDFG.so -inplace -hpvm-fuse -dfg2llvm-promise -quantization-levels-filename=$(PROMISE_QUANT_FILE_PATH) -dfg2llvm-cudnn -dfg2llvm-x86 -clearDFG WRAPPER_API_QUANT_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/data/quant_ranges_rt.txt -CONF_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/data/tuner_confs_base.txt +CONF_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/data/tuner_confs.txt VISC_OPTFLAGS3 = -load $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LLVMInPlaceDFGAnalysis.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_WrapperAPI.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_X86.so -load $(HPVM_LIB_DIR)/LLVMFuseHPVMTensorNodes.so -load $(HPVM_LIB_DIR)/LLVMClearDFG.so -inplace -hpvm-fuse -dfg2llvm-wrapperapi -quantization-levels-filename=$(WRAPPER_API_QUANT_FILE_PATH) -configuration-inputs-filename=$(CONF_FILE_PATH) -dfg2llvm-x86 -clearDFG +TEMP_CONF_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/predictive/temp.txt + + +VISC_PRED_OPTFLAGS3 = -load $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LLVMInPlaceDFGAnalysis.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_WrapperAPI.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_X86.so -load $(HPVM_LIB_DIR)/LLVMFuseHPVMTensorNodes.so -load $(HPVM_LIB_DIR)/LLVMClearDFG.so -inplace -hpvm-fuse -dfg2llvm-wrapperapi -quantization-levels-filename=$(WRAPPER_API_QUANT_FILE_PATH) -configuration-inputs-filename=$(TEMP_CONF_FILE_PATH) -dfg2llvm-x86 -clearDFG + TARGET = $(BUILD_DIR)/$(APP).opt.bc SOURCES = $(SRC_DIR)/$(APP).cpp VISC_RT_PATH = $(LLVM_SRC_ROOT)/../build/projects/visc-rt/visc-rt.ll @@ -63,14 +68,20 @@ $(BUILD_DIR)/%.opt.bc: $(BUILD_DIR)/%.ll #$(OPT) $(VISC_OPTFLAGS2) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_promise.bc $(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_wrapperapi.bc $(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_loop.visc.ll -o $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc + $(OPT) $(VISC_PRED_OPTFLAGS3) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_pred_wrapperapi.bc + $(OPT) $(VISC_PRED_OPTFLAGS3) $(BUILD_DIR)/$(APP)_loop.visc.ll -o $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi.bc $(LLVM_LINK) $(BUILD_DIR)/$(APP)_cudnn.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_cudnn_linked.bc #$(LLVM_LINK) $(BUILD_DIR)/$(APP)_promise.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_promise_linked.bc $(LLVM_LINK) $(BUILD_DIR)/$(APP)_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(LLVM_LINK) $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc + $(LLVM_LINK) $(BUILD_DIR)/$(APP)_pred_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_pred_wrapperapi_linked.bc + $(LLVM_LINK) $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi_linked.bc $(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_cudnn_linked $(LINKER_FLAGS) #$(CC) $(BUILD_DIR)/$(APP)_promise_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_promise_linked $(LINKER_FLAGS) $(CC) $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked $(LINKER_FLAGS) $(CC) $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked $(LINKER_FLAGS) + $(CC) $(BUILD_DIR)/$(APP)_pred_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_pred_wrapperapi_linked $(LINKER_FLAGS) + $(CC) $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi_linked $(LINKER_FLAGS) $(BUILD_DIR): mkdir -p $@ diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/run_data/out-run-1 b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/run_data/out-run-1 new file mode 100644 index 0000000000000000000000000000000000000000..4cdf31cb691525b3239749ffb90e18a3b9c10b21 --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/data/run_data/out-run-1 @@ -0,0 +1,12111 @@ +size_in_bytes = 1728 +DEBUG: ***--- size_in_bytes = 1728 +DEBUG: Attempting to Allocate = 1728 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 27, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 64 +DEBUG: ***--- size_in_bytes = 64 +DEBUG: Attempting to Allocate = 64 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 9216 +DEBUG: ***--- size_in_bytes = 9216 +DEBUG: Attempting to Allocate = 9216 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 144, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 64 +DEBUG: ***--- size_in_bytes = 64 +DEBUG: Attempting to Allocate = 64 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 9216 +DEBUG: ***--- size_in_bytes = 9216 +DEBUG: Attempting to Allocate = 9216 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 144, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 64 +DEBUG: ***--- size_in_bytes = 64 +DEBUG: Attempting to Allocate = 64 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 9216 +DEBUG: ***--- size_in_bytes = 9216 +DEBUG: Attempting to Allocate = 9216 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 144, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 64 +DEBUG: ***--- size_in_bytes = 64 +DEBUG: Attempting to Allocate = 64 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 9216 +DEBUG: ***--- size_in_bytes = 9216 +DEBUG: Attempting to Allocate = 9216 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 144, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 64 +DEBUG: ***--- size_in_bytes = 64 +DEBUG: Attempting to Allocate = 64 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 9216 +DEBUG: ***--- size_in_bytes = 9216 +DEBUG: Attempting to Allocate = 9216 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 144, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 64 +DEBUG: ***--- size_in_bytes = 64 +DEBUG: Attempting to Allocate = 64 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 9216 +DEBUG: ***--- size_in_bytes = 9216 +DEBUG: Attempting to Allocate = 9216 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 144, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 64 +DEBUG: ***--- size_in_bytes = 64 +DEBUG: Attempting to Allocate = 64 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 18432 +DEBUG: ***--- size_in_bytes = 18432 +DEBUG: Attempting to Allocate = 18432 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 144, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 128 +DEBUG: ***--- size_in_bytes = 128 +DEBUG: Attempting to Allocate = 128 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 2048 +DEBUG: ***--- size_in_bytes = 2048 +DEBUG: Attempting to Allocate = 2048 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 128 +DEBUG: ***--- size_in_bytes = 128 +DEBUG: Attempting to Allocate = 128 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 36864 +DEBUG: ***--- size_in_bytes = 36864 +DEBUG: Attempting to Allocate = 36864 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 288, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 128 +DEBUG: ***--- size_in_bytes = 128 +DEBUG: Attempting to Allocate = 128 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 36864 +DEBUG: ***--- size_in_bytes = 36864 +DEBUG: Attempting to Allocate = 36864 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 288, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 128 +DEBUG: ***--- size_in_bytes = 128 +DEBUG: Attempting to Allocate = 128 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 36864 +DEBUG: ***--- size_in_bytes = 36864 +DEBUG: Attempting to Allocate = 36864 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 288, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 128 +DEBUG: ***--- size_in_bytes = 128 +DEBUG: Attempting to Allocate = 128 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 36864 +DEBUG: ***--- size_in_bytes = 36864 +DEBUG: Attempting to Allocate = 36864 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 288, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 128 +DEBUG: ***--- size_in_bytes = 128 +DEBUG: Attempting to Allocate = 128 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 36864 +DEBUG: ***--- size_in_bytes = 36864 +DEBUG: Attempting to Allocate = 36864 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 288, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 128 +DEBUG: ***--- size_in_bytes = 128 +DEBUG: Attempting to Allocate = 128 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 73728 +DEBUG: ***--- size_in_bytes = 73728 +DEBUG: Attempting to Allocate = 73728 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 288, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 256 +DEBUG: ***--- size_in_bytes = 256 +DEBUG: Attempting to Allocate = 256 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 64, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 8192 +DEBUG: ***--- size_in_bytes = 8192 +DEBUG: Attempting to Allocate = 8192 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 256 +DEBUG: ***--- size_in_bytes = 256 +DEBUG: Attempting to Allocate = 256 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 64, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 147456 +DEBUG: ***--- size_in_bytes = 147456 +DEBUG: Attempting to Allocate = 147456 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 576, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 256 +DEBUG: ***--- size_in_bytes = 256 +DEBUG: Attempting to Allocate = 256 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 64, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 147456 +DEBUG: ***--- size_in_bytes = 147456 +DEBUG: Attempting to Allocate = 147456 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 576, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 256 +DEBUG: ***--- size_in_bytes = 256 +DEBUG: Attempting to Allocate = 256 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 64, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 147456 +DEBUG: ***--- size_in_bytes = 147456 +DEBUG: Attempting to Allocate = 147456 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 576, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 256 +DEBUG: ***--- size_in_bytes = 256 +DEBUG: Attempting to Allocate = 256 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 64, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 147456 +DEBUG: ***--- size_in_bytes = 147456 +DEBUG: Attempting to Allocate = 147456 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 576, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 256 +DEBUG: ***--- size_in_bytes = 256 +DEBUG: Attempting to Allocate = 256 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 64, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 147456 +DEBUG: ***--- size_in_bytes = 147456 +DEBUG: Attempting to Allocate = 147456 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 576, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 256 +DEBUG: ***--- size_in_bytes = 256 +DEBUG: Attempting to Allocate = 256 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 64, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 2560 +DEBUG: ***--- size_in_bytes = 2560 +DEBUG: Attempting to Allocate = 2560 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 640, cStride = 640, hStride = 10, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 40 +DEBUG: ***--- size_in_bytes = 40 +DEBUG: Attempting to Allocate = 40 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INITIALIZING GPU 0 +CREATED HANDLES 0 +INFO: +WARNING: File 'opentuner_flags' not found + + +initializing tuner .... +* LLVM_SRC_ROOT = /home/akashk4/merge/profiling/hpvm/llvm +- knobs_file_path = /home/akashk4/merge/profiling/hpvm/llvm/projects/hpvm-tensor-rt/autotuner/data/global_knobs.txt +*LLVM_SRC_ROOT = /home/akashk4/merge/profiling/hpvm/llvm- knobs_file_path = /home/akashk4/merge/profiling/hpvm/llvm/projects/hpvm-tensor-rt/autotuner/data/global_knobs.txt +Read PROMISE FLAGS 0 +DONE INTIALIZING GPU 0 +INFO: Reading Quantization Ranges File... +INFO: DONE. +INFO: Reading Configuration File... +DEBUG: first_line: 2000 +DEBUG: Baseline time: 2000.000000 + +DEBUG: line: +++++ +DEBUG: t: +++++ +DEBUG: +DEBUG: line: conf1 1 0 89.59 0 +DEBUG: t: conf1 +DEBUG: t: 1 +DEBUG: t: 0 +DEBUG: t: 89.59 +DEBUG: t: 0 +DEBUG: +DEBUG: line: 1 gpu conv fp32 1 add fp32 1 relu fp32 1 +DEBUG: t: 1 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 1 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 2 gpu conv fp32 1 add fp32 1 relu fp32 1 +DEBUG: t: 2 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 4 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 3 gpu conv fp32 1 add fp32 1 +DEBUG: t: 3 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 7 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 4 gpu add fp32 1 +DEBUG: t: 4 +DEBUG: t: gpu +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 9 + +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 5 gpu relu fp32 1 +DEBUG: t: 5 +DEBUG: t: gpu +DEBUG: t: relu +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 10 + +DEBUG: Found relu operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 6 gpu conv fp32 1 add fp32 1 relu fp32 1 +DEBUG: t: 6 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 11 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 7 gpu conv fp32 1 add fp32 1 +DEBUG: t: 7 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 14 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 8 gpu add fp32 1 +DEBUG: t: 8 +DEBUG: t: gpu +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 16 + +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 9 gpu relu fp32 1 +DEBUG: t: 9 +DEBUG: t: gpu +DEBUG: t: relu +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 17 + +DEBUG: Found relu operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 10 gpu conv fp32 1 add fp32 1 relu fp32 1 +DEBUG: t: 10 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 18 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 11 gpu conv fp32 1 add fp32 1 +DEBUG: t: 11 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 21 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 12 gpu add fp32 1 +DEBUG: t: 12 +DEBUG: t: gpu +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 23 + +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 13 gpu relu fp32 1 +DEBUG: t: 13 +DEBUG: t: gpu +DEBUG: t: relu +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 24 + +DEBUG: Found relu operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 14 gpu conv fp32 1 add fp32 1 relu fp32 1 +DEBUG: t: 14 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 25 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 15 gpu conv fp32 1 add fp32 1 +DEBUG: t: 15 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 28 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 16 gpu conv fp32 1 add fp32 1 +DEBUG: t: 16 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 30 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 17 gpu add fp32 1 +DEBUG: t: 17 +DEBUG: t: gpu +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 32 + +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 18 gpu relu fp32 1 +DEBUG: t: 18 +DEBUG: t: gpu +DEBUG: t: relu +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 33 + +DEBUG: Found relu operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 19 gpu conv fp32 1 add fp32 1 relu fp32 1 +DEBUG: t: 19 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 34 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 20 gpu conv fp32 1 add fp32 1 +DEBUG: t: 20 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 37 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 21 gpu add fp32 1 +DEBUG: t: 21 +DEBUG: t: gpu +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 39 + +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 22 gpu relu fp32 1 +DEBUG: t: 22 +DEBUG: t: gpu +DEBUG: t: relu +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 40 + +DEBUG: Found relu operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 23 gpu conv fp32 1 add fp32 1 relu fp32 1 +DEBUG: t: 23 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 41 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 24 gpu conv fp32 1 add fp32 1 +DEBUG: t: 24 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 44 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 25 gpu add fp32 1 +DEBUG: t: 25 +DEBUG: t: gpu +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 46 + +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 26 gpu relu fp32 1 +DEBUG: t: 26 +DEBUG: t: gpu +DEBUG: t: relu +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 47 + +DEBUG: Found relu operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 27 gpu conv fp32 1 add fp32 1 relu fp32 1 +DEBUG: t: 27 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 48 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 28 gpu conv fp32 1 add fp32 1 +DEBUG: t: 28 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 51 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 29 gpu conv fp32 1 add fp32 1 +DEBUG: t: 29 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 53 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 30 gpu add fp32 1 +DEBUG: t: 30 +DEBUG: t: gpu +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 55 + +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 31 gpu relu fp32 1 +DEBUG: t: 31 +DEBUG: t: gpu +DEBUG: t: relu +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 56 + +DEBUG: Found relu operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 32 gpu conv fp32 1 add fp32 1 relu fp32 1 +DEBUG: t: 32 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 57 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 33 gpu conv fp32 1 add fp32 1 +DEBUG: t: 33 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 60 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 34 gpu add fp32 1 +DEBUG: t: 34 +DEBUG: t: gpu +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 62 + +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 35 gpu relu fp32 1 +DEBUG: t: 35 +DEBUG: t: gpu +DEBUG: t: relu +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 63 + +DEBUG: Found relu operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 36 gpu conv fp32 1 add fp32 1 relu fp32 1 +DEBUG: t: 36 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 64 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 37 gpu conv fp32 1 add fp32 1 +DEBUG: t: 37 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 67 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 38 gpu add fp32 1 +DEBUG: t: 38 +DEBUG: t: gpu +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 69 + +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 39 gpu relu fp32 1 +DEBUG: t: 39 +DEBUG: t: gpu +DEBUG: t: relu +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 70 + +DEBUG: Found relu operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 40 gpu pool_mean fp32 1 +DEBUG: t: 40 +DEBUG: t: gpu +DEBUG: t: pool_mean +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 71 + +DEBUG: Found pool_mean operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 41 gpu mul fp32 1 add fp32 1 +DEBUG: t: 41 +DEBUG: t: gpu +DEBUG: t: mul +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 72 + +DEBUG: Found mul operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 42 gpu softmax fp32 1 +DEBUG: t: 42 +DEBUG: t: gpu +DEBUG: t: softmax +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 74 + +DEBUG: Found softmax operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: ----- +DEBUG: t: ----- +DEBUG: +DEBUG: line: +++++ +DEBUG: t: +++++ +DEBUG: +DEBUG: line: conf2 1.5 0 89.59 0 +DEBUG: t: conf2 +DEBUG: t: 1.5 +DEBUG: t: 0 +DEBUG: t: 89.59 +DEBUG: t: 0 +DEBUG: +DEBUG: line: 1 gpu conv fp16 1 add fp16 1 relu fp16 1 +DEBUG: t: 1 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 1 + +DEBUG: Found conv operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 2 gpu conv fp16 1 add fp16 1 relu fp16 1 +DEBUG: t: 2 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 4 + +DEBUG: Found conv operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 3 gpu conv fp16 1 add fp16 1 +DEBUG: t: 3 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 7 + +DEBUG: Found conv operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 4 gpu add fp16 1 +DEBUG: t: 4 +DEBUG: t: gpu +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 9 + +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 5 gpu relu fp16 1 +DEBUG: t: 5 +DEBUG: t: gpu +DEBUG: t: relu +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 10 + +DEBUG: Found relu operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 6 gpu conv fp16 1 add fp16 1 relu fp16 1 +DEBUG: t: 6 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 11 + +DEBUG: Found conv operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 7 gpu conv fp16 1 add fp16 1 +DEBUG: t: 7 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 14 + +DEBUG: Found conv operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 8 gpu add fp16 1 +DEBUG: t: 8 +DEBUG: t: gpu +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 16 + +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 9 gpu relu fp16 1 +DEBUG: t: 9 +DEBUG: t: gpu +DEBUG: t: relu +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 17 + +DEBUG: Found relu operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 10 gpu conv fp16 1 add fp16 1 relu fp16 1 +DEBUG: t: 10 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 18 + +DEBUG: Found conv operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 11 gpu conv fp16 1 add fp16 1 +DEBUG: t: 11 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 21 + +DEBUG: Found conv operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 12 gpu add fp16 1 +DEBUG: t: 12 +DEBUG: t: gpu +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 23 + +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 13 gpu relu fp16 1 +DEBUG: t: 13 +DEBUG: t: gpu +DEBUG: t: relu +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 24 + +DEBUG: Found relu operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 14 gpu conv fp16 1 add fp16 1 relu fp16 1 +DEBUG: t: 14 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 25 + +DEBUG: Found conv operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 15 gpu conv fp16 1 add fp16 1 +DEBUG: t: 15 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 28 + +DEBUG: Found conv operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 16 gpu conv fp16 1 add fp16 1 +DEBUG: t: 16 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 30 + +DEBUG: Found conv operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 17 gpu add fp16 1 +DEBUG: t: 17 +DEBUG: t: gpu +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 32 + +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 18 gpu relu fp16 1 +DEBUG: t: 18 +DEBUG: t: gpu +DEBUG: t: relu +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 33 + +DEBUG: Found relu operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 19 gpu conv fp16 1 add fp16 1 relu fp16 1 +DEBUG: t: 19 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 34 + +DEBUG: Found conv operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 20 gpu conv fp16 1 add fp16 1 +DEBUG: t: 20 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 37 + +DEBUG: Found conv operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 21 gpu add fp16 1 +DEBUG: t: 21 +DEBUG: t: gpu +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 39 + +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 22 gpu relu fp16 1 +DEBUG: t: 22 +DEBUG: t: gpu +DEBUG: t: relu +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 40 + +DEBUG: Found relu operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 23 gpu conv fp16 1 add fp16 1 relu fp16 1 +DEBUG: t: 23 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 41 + +DEBUG: Found conv operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 24 gpu conv fp16 1 add fp16 1 +DEBUG: t: 24 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 44 + +DEBUG: Found conv operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 25 gpu add fp16 1 +DEBUG: t: 25 +DEBUG: t: gpu +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 46 + +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 26 gpu relu fp16 1 +DEBUG: t: 26 +DEBUG: t: gpu +DEBUG: t: relu +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 47 + +DEBUG: Found relu operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 27 gpu conv fp16 1 add fp16 1 relu fp16 1 +DEBUG: t: 27 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 48 + +DEBUG: Found conv operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 28 gpu conv fp16 1 add fp16 1 +DEBUG: t: 28 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 51 + +DEBUG: Found conv operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 29 gpu conv fp16 1 add fp16 1 +DEBUG: t: 29 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 53 + +DEBUG: Found conv operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 30 gpu add fp16 1 +DEBUG: t: 30 +DEBUG: t: gpu +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 55 + +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 31 gpu relu fp16 1 +DEBUG: t: 31 +DEBUG: t: gpu +DEBUG: t: relu +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 56 + +DEBUG: Found relu operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 32 gpu conv fp16 1 add fp16 1 relu fp16 1 +DEBUG: t: 32 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 57 + +DEBUG: Found conv operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 33 gpu conv fp16 1 add fp16 1 +DEBUG: t: 33 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 60 + +DEBUG: Found conv operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 34 gpu add fp16 1 +DEBUG: t: 34 +DEBUG: t: gpu +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 62 + +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 35 gpu relu fp16 1 +DEBUG: t: 35 +DEBUG: t: gpu +DEBUG: t: relu +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 63 + +DEBUG: Found relu operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 36 gpu conv fp16 1 add fp16 1 relu fp16 1 +DEBUG: t: 36 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 64 + +DEBUG: Found conv operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 37 gpu conv fp16 1 add fp16 1 +DEBUG: t: 37 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 67 + +DEBUG: Found conv operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 38 gpu add fp16 1 +DEBUG: t: 38 +DEBUG: t: gpu +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 69 + +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 39 gpu relu fp16 1 +DEBUG: t: 39 +DEBUG: t: gpu +DEBUG: t: relu +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 70 + +DEBUG: Found relu operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 40 gpu pool_mean fp16 1 +DEBUG: t: 40 +DEBUG: t: gpu +DEBUG: t: pool_mean +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 71 + +DEBUG: Found pool_mean operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 41 gpu mul fp16 1 add fp16 1 +DEBUG: t: 41 +DEBUG: t: gpu +DEBUG: t: mul +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 72 + +DEBUG: Found mul operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 42 gpu softmax fp32 1 +DEBUG: t: 42 +DEBUG: t: gpu +DEBUG: t: softmax +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 74 + +DEBUG: Found softmax operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: ----- +DEBUG: t: ----- +DEBUG: +DEBUG: DONE. +INFO: Sorting autotuner configurations... +INFO: Done sorting. +DEBUG: start_idx = 1, end_idx = 2 +DEBUG: accuracy loss = 0.000000, speedup = 1.500000, at sp_idx = 1 +DEBUG: accuracy loss = 0.000000, energy = 0.000000, at en_idx = 1 +DEBUG: sp_notDominated = 1 +DEBUG: en_notDominated = 0 +INFO: Speedup Configurations ++++++ +conf1 1.000000 0.000000 89.589996 0.000000 +1 : gpu conv fp32 1 add fp32 1 relu fp32 1 +10 : gpu conv fp32 1 add fp32 1 relu fp32 1 +11 : gpu conv fp32 1 add fp32 1 +12 : gpu add fp32 1 +13 : gpu relu fp32 1 +14 : gpu conv fp32 1 add fp32 1 relu fp32 1 +15 : gpu conv fp32 1 add fp32 1 +16 : gpu conv fp32 1 add fp32 1 +17 : gpu add fp32 1 +18 : gpu relu fp32 1 +19 : gpu conv fp32 1 add fp32 1 relu fp32 1 +2 : gpu conv fp32 1 add fp32 1 relu fp32 1 +20 : gpu conv fp32 1 add fp32 1 +21 : gpu add fp32 1 +22 : gpu relu fp32 1 +23 : gpu conv fp32 1 add fp32 1 relu fp32 1 +24 : gpu conv fp32 1 add fp32 1 +25 : gpu add fp32 1 +26 : gpu relu fp32 1 +27 : gpu conv fp32 1 add fp32 1 relu fp32 1 +28 : gpu conv fp32 1 add fp32 1 +29 : gpu conv fp32 1 add fp32 1 +3 : gpu conv fp32 1 add fp32 1 +30 : gpu add fp32 1 +31 : gpu relu fp32 1 +32 : gpu conv fp32 1 add fp32 1 relu fp32 1 +33 : gpu conv fp32 1 add fp32 1 +34 : gpu add fp32 1 +35 : gpu relu fp32 1 +36 : gpu conv fp32 1 add fp32 1 relu fp32 1 +37 : gpu conv fp32 1 add fp32 1 +38 : gpu add fp32 1 +39 : gpu relu fp32 1 +4 : gpu add fp32 1 +40 : gpu pool_mean fp32 1 +41 : gpu mul fp32 1 add fp32 1 +42 : gpu softmax fp32 1 +5 : gpu relu fp32 1 +6 : gpu conv fp32 1 add fp32 1 relu fp32 1 +7 : gpu conv fp32 1 add fp32 1 +8 : gpu add fp32 1 +9 : gpu relu fp32 1 +----- ++++++ +conf2 1.500000 0.000000 89.589996 0.000000 +1 : gpu conv fp16 1 add fp16 1 relu fp16 1 +10 : gpu conv fp16 1 add fp16 1 relu fp16 1 +11 : gpu conv fp16 1 add fp16 1 +12 : gpu add fp16 1 +13 : gpu relu fp16 1 +14 : gpu conv fp16 1 add fp16 1 relu fp16 1 +15 : gpu conv fp16 1 add fp16 1 +16 : gpu conv fp16 1 add fp16 1 +17 : gpu add fp16 1 +18 : gpu relu fp16 1 +19 : gpu conv fp16 1 add fp16 1 relu fp16 1 +2 : gpu conv fp16 1 add fp16 1 relu fp16 1 +20 : gpu conv fp16 1 add fp16 1 +21 : gpu add fp16 1 +22 : gpu relu fp16 1 +23 : gpu conv fp16 1 add fp16 1 relu fp16 1 +24 : gpu conv fp16 1 add fp16 1 +25 : gpu add fp16 1 +26 : gpu relu fp16 1 +27 : gpu conv fp16 1 add fp16 1 relu fp16 1 +28 : gpu conv fp16 1 add fp16 1 +29 : gpu conv fp16 1 add fp16 1 +3 : gpu conv fp16 1 add fp16 1 +30 : gpu add fp16 1 +31 : gpu relu fp16 1 +32 : gpu conv fp16 1 add fp16 1 relu fp16 1 +33 : gpu conv fp16 1 add fp16 1 +34 : gpu add fp16 1 +35 : gpu relu fp16 1 +36 : gpu conv fp16 1 add fp16 1 relu fp16 1 +37 : gpu conv fp16 1 add fp16 1 +38 : gpu add fp16 1 +39 : gpu relu fp16 1 +4 : gpu add fp16 1 +40 : gpu pool_mean fp16 1 +41 : gpu mul fp16 1 add fp16 1 +42 : gpu softmax fp32 1 +5 : gpu relu fp16 1 +6 : gpu conv fp16 1 add fp16 1 relu fp16 1 +7 : gpu conv fp16 1 add fp16 1 +8 : gpu add fp16 1 +9 : gpu relu fp16 1 +----- +DEBUG: slowdowns file not found. Initializing slowdowns randomly. +*LLVM_SRC_ROOT = /home/akashk4/merge/profiling/hpvm/llvm- knobs_file_path = /home/akashk4/merge/profiling/hpvm/llvm/projects/hpvm-tensor-rt/autotuner/data/global_knobs.txt +* LLVM_SRC_ROOT = /home/akashk4/merge/profiling/hpvm/llvm +- knobs_file_path = /home/akashk4/merge/profiling/hpvm/llvm/projects/hpvm-tensor-rt/autotuner/data/global_knobs.txt +WARNING: pause_profiler was already called +Initializing policy object ... +DONE: Initializing policy object. +Select target device (0 for CPU, 1 fpr GPU): DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +INFO: Moving 1728 bytes from host to GPU +INFO: Moving 64 bytes from host to GPU +DEBUG: -- currentTensorID = 1 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.102505 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.102816 +INFO: TimeDuration, Event = Add_end, Time = 0.000311 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.102857 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.103118 +INFO: TimeDuration, Event = Relu_end, Time = 0.000261 +DEBUG: No data movement required - Data on Device +INFO: Moving 9216 bytes from host to GPU +INFO: Moving 64 bytes from host to GPU +DEBUG: -- currentTensorID = 4 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.111648 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.111923 +INFO: TimeDuration, Event = Add_end, Time = 0.000275 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.111961 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.112204 +INFO: TimeDuration, Event = Relu_end, Time = 0.000243 +DEBUG: No data movement required - Data on Device +INFO: Moving 9216 bytes from host to GPU +INFO: Moving 64 bytes from host to GPU +DEBUG: -- currentTensorID = 7 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.124980 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.125228 +INFO: TimeDuration, Event = Add_end, Time = 0.000248 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 9 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.125249 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 8192000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.125595 +INFO: TimeDuration, Event = Add_end, Time = 0.000346 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 10 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.125616 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.125844 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +INFO: Moving 9216 bytes from host to GPU +INFO: Moving 64 bytes from host to GPU +DEBUG: -- currentTensorID = 11 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.140857 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.141132 +INFO: TimeDuration, Event = Add_end, Time = 0.000276 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.141151 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.141383 +INFO: TimeDuration, Event = Relu_end, Time = 0.000232 +DEBUG: No data movement required - Data on Device +INFO: Moving 9216 bytes from host to GPU +INFO: Moving 64 bytes from host to GPU +DEBUG: -- currentTensorID = 14 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.152460 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.152709 +INFO: TimeDuration, Event = Add_end, Time = 0.000249 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 16 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.152729 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 8192000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.153073 +INFO: TimeDuration, Event = Add_end, Time = 0.000344 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 17 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.153091 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.153316 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +INFO: Moving 9216 bytes from host to GPU +INFO: Moving 64 bytes from host to GPU +DEBUG: -- currentTensorID = 18 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.167899 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.168150 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.168166 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.168405 +INFO: TimeDuration, Event = Relu_end, Time = 0.000238 +DEBUG: No data movement required - Data on Device +INFO: Moving 9216 bytes from host to GPU +INFO: Moving 64 bytes from host to GPU +DEBUG: -- currentTensorID = 21 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.181180 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.181426 +INFO: TimeDuration, Event = Add_end, Time = 0.000246 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 23 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.181445 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 8192000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.181792 +INFO: TimeDuration, Event = Add_end, Time = 0.000347 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 24 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.181810 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.182039 +INFO: TimeDuration, Event = Relu_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +INFO: Moving 18432 bytes from host to GPU +INFO: Moving 128 bytes from host to GPU +DEBUG: -- currentTensorID = 25 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.190006 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.190155 +INFO: TimeDuration, Event = Add_end, Time = 0.000149 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.190174 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.190299 +INFO: TimeDuration, Event = Relu_end, Time = 0.000125 +DEBUG: No data movement required - Data on Device +INFO: Moving 2048 bytes from host to GPU +INFO: Moving 128 bytes from host to GPU +DEBUG: -- currentTensorID = 30 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.192734 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.192873 +INFO: TimeDuration, Event = Add_end, Time = 0.000139 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: Moving 36864 bytes from host to GPU +INFO: Moving 128 bytes from host to GPU +DEBUG: -- currentTensorID = 28 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.195304 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.195446 +INFO: TimeDuration, Event = Add_end, Time = 0.000142 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 32 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.195465 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 4096000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.195679 +INFO: TimeDuration, Event = Add_end, Time = 0.000214 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 33 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.195696 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.195821 +INFO: TimeDuration, Event = Relu_end, Time = 0.000125 +DEBUG: No data movement required - Data on Device +INFO: Moving 36864 bytes from host to GPU +INFO: Moving 128 bytes from host to GPU +DEBUG: -- currentTensorID = 34 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.201139 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.201280 +INFO: TimeDuration, Event = Add_end, Time = 0.000141 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.201296 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.201422 +INFO: TimeDuration, Event = Relu_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +INFO: Moving 36864 bytes from host to GPU +INFO: Moving 128 bytes from host to GPU +DEBUG: -- currentTensorID = 37 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.206897 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.207034 +INFO: TimeDuration, Event = Add_end, Time = 0.000137 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 39 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.207048 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 4096000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.207248 +INFO: TimeDuration, Event = Add_end, Time = 0.000200 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 40 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.207261 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.207381 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +DEBUG: No data movement required - Data on Device +INFO: Moving 36864 bytes from host to GPU +INFO: Moving 128 bytes from host to GPU +DEBUG: -- currentTensorID = 41 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.212560 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.212699 +INFO: TimeDuration, Event = Add_end, Time = 0.000139 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.212711 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.212834 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +INFO: Moving 36864 bytes from host to GPU +INFO: Moving 128 bytes from host to GPU +DEBUG: -- currentTensorID = 44 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.218241 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.218378 +INFO: TimeDuration, Event = Add_end, Time = 0.000137 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 46 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.218392 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 4096000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.218592 +INFO: TimeDuration, Event = Add_end, Time = 0.000200 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 47 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.218608 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.218728 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +DEBUG: No data movement required - Data on Device +INFO: Moving 73728 bytes from host to GPU +INFO: Moving 256 bytes from host to GPU +DEBUG: -- currentTensorID = 48 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.222892 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.222973 +INFO: TimeDuration, Event = Add_end, Time = 0.000081 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.222986 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.223056 +INFO: TimeDuration, Event = Relu_end, Time = 0.000071 +DEBUG: No data movement required - Data on Device +INFO: Moving 8192 bytes from host to GPU +INFO: Moving 256 bytes from host to GPU +DEBUG: -- currentTensorID = 53 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.224449 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.224525 +INFO: TimeDuration, Event = Add_end, Time = 0.000076 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: Moving 147456 bytes from host to GPU +INFO: Moving 256 bytes from host to GPU +DEBUG: -- currentTensorID = 51 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.226019 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.226102 +INFO: TimeDuration, Event = Add_end, Time = 0.000083 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 55 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.226116 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 2048000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.226216 +INFO: TimeDuration, Event = Add_end, Time = 0.000100 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 56 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.226228 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.226297 +INFO: TimeDuration, Event = Relu_end, Time = 0.000068 +DEBUG: No data movement required - Data on Device +INFO: Moving 147456 bytes from host to GPU +INFO: Moving 256 bytes from host to GPU +DEBUG: -- currentTensorID = 57 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.229331 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.229415 +INFO: TimeDuration, Event = Add_end, Time = 0.000084 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.229427 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.229497 +INFO: TimeDuration, Event = Relu_end, Time = 0.000070 +DEBUG: No data movement required - Data on Device +INFO: Moving 147456 bytes from host to GPU +INFO: Moving 256 bytes from host to GPU +DEBUG: -- currentTensorID = 60 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.232643 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.232727 +INFO: TimeDuration, Event = Add_end, Time = 0.000084 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 62 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.232741 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 2048000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.232841 +INFO: TimeDuration, Event = Add_end, Time = 0.000100 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 63 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.232853 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.232921 +INFO: TimeDuration, Event = Relu_end, Time = 0.000068 +DEBUG: No data movement required - Data on Device +INFO: Moving 147456 bytes from host to GPU +INFO: Moving 256 bytes from host to GPU +DEBUG: -- currentTensorID = 64 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.235954 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.236036 +INFO: TimeDuration, Event = Add_end, Time = 0.000082 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.236087 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.236157 +INFO: TimeDuration, Event = Relu_end, Time = 0.000070 +DEBUG: No data movement required - Data on Device +INFO: Moving 147456 bytes from host to GPU +INFO: Moving 256 bytes from host to GPU +DEBUG: -- currentTensorID = 67 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.239252 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.239334 +INFO: TimeDuration, Event = Add_end, Time = 0.000083 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 69 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.239349 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 2048000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.239450 +INFO: TimeDuration, Event = Add_end, Time = 0.000101 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 70 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.239463 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.239530 +INFO: TimeDuration, Event = Relu_end, Time = 0.000068 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 71 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352815.239544 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 1, w = 1 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 128000 +DEBUG: Attempting to Allocate = 128000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 64, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352815.239671 +INFO: TimeDuration, Event = Pool_end, Time = 0.000127 +DEBUG: No data movement required - Data on Device +INFO: Moving 2560 bytes from host to GPU +INFO: Moving 40 bytes from host to GPU +DEBUG: -- currentTensorID = 72 +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352815.239701 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 64 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352815.239752 +INFO: TimeDuration, Event = Mul_end, Time = 0.000051 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.239765 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.239789 +INFO: TimeDuration, Event = Add_end, Time = 0.000024 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 74 +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352815.239802 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352815.239855 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000053 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 91.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 127.039922, current iteration energy = 0.000000 + +RUNNING BATCH = 0 +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 1 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.319836 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.320084 +INFO: TimeDuration, Event = Add_end, Time = 0.000248 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.320188 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.320417 +INFO: TimeDuration, Event = Relu_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 4 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.327363 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.327602 +INFO: TimeDuration, Event = Add_end, Time = 0.000240 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.327618 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.327842 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 7 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.340800 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.341040 +INFO: TimeDuration, Event = Add_end, Time = 0.000240 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 9 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.341053 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 8192000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.341396 +INFO: TimeDuration, Event = Add_end, Time = 0.000343 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 10 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.341410 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.341634 +INFO: TimeDuration, Event = Relu_end, Time = 0.000224 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 11 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.353894 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.354133 +INFO: TimeDuration, Event = Add_end, Time = 0.000240 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.354145 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.354369 +INFO: TimeDuration, Event = Relu_end, Time = 0.000224 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 14 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.366997 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.367239 +INFO: TimeDuration, Event = Add_end, Time = 0.000242 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 16 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.367253 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 8192000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.367594 +INFO: TimeDuration, Event = Add_end, Time = 0.000341 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 17 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.367605 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.367827 +INFO: TimeDuration, Event = Relu_end, Time = 0.000222 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 18 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.380150 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.380391 +INFO: TimeDuration, Event = Add_end, Time = 0.000241 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.380433 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.380659 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 21 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.393281 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.393520 +INFO: TimeDuration, Event = Add_end, Time = 0.000238 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 23 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.393534 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 8192000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.393875 +INFO: TimeDuration, Event = Add_end, Time = 0.000341 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 24 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.393887 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.394109 +INFO: TimeDuration, Event = Relu_end, Time = 0.000222 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 25 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.401817 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.401954 +INFO: TimeDuration, Event = Add_end, Time = 0.000137 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.401966 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.402087 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 30 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.404439 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.404573 +INFO: TimeDuration, Event = Add_end, Time = 0.000134 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 28 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.406726 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.406862 +INFO: TimeDuration, Event = Add_end, Time = 0.000136 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 32 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.406877 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 4096000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.407078 +INFO: TimeDuration, Event = Add_end, Time = 0.000201 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 33 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.407089 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.407208 +INFO: TimeDuration, Event = Relu_end, Time = 0.000119 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 34 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.412396 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.412533 +INFO: TimeDuration, Event = Add_end, Time = 0.000136 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.412544 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.412665 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 37 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.418060 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.418196 +INFO: TimeDuration, Event = Add_end, Time = 0.000136 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 39 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.418211 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 4096000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.418410 +INFO: TimeDuration, Event = Add_end, Time = 0.000200 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 40 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.418422 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.418542 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 41 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.423717 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.423855 +INFO: TimeDuration, Event = Add_end, Time = 0.000137 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.423866 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.423989 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 44 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.429380 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.429517 +INFO: TimeDuration, Event = Add_end, Time = 0.000136 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 46 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.429530 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 4096000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.429730 +INFO: TimeDuration, Event = Add_end, Time = 0.000200 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 47 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.429742 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.429863 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 48 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.433891 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.433972 +INFO: TimeDuration, Event = Add_end, Time = 0.000081 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.433984 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.434053 +INFO: TimeDuration, Event = Relu_end, Time = 0.000069 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 53 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.435350 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.435433 +INFO: TimeDuration, Event = Add_end, Time = 0.000082 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 51 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.436785 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.436867 +INFO: TimeDuration, Event = Add_end, Time = 0.000082 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 55 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.436905 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 2048000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.437006 +INFO: TimeDuration, Event = Add_end, Time = 0.000101 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 56 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.437018 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.437088 +INFO: TimeDuration, Event = Relu_end, Time = 0.000069 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 57 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.439907 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.439991 +INFO: TimeDuration, Event = Add_end, Time = 0.000083 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.440002 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.440073 +INFO: TimeDuration, Event = Relu_end, Time = 0.000070 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 60 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.443053 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.443136 +INFO: TimeDuration, Event = Add_end, Time = 0.000083 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 62 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.443150 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 2048000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.443249 +INFO: TimeDuration, Event = Add_end, Time = 0.000100 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 63 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.443261 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.443330 +INFO: TimeDuration, Event = Relu_end, Time = 0.000069 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 64 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.446179 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.446262 +INFO: TimeDuration, Event = Add_end, Time = 0.000083 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.446282 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.446353 +INFO: TimeDuration, Event = Relu_end, Time = 0.000071 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 67 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.449319 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.449400 +INFO: TimeDuration, Event = Add_end, Time = 0.000081 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 69 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.449414 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 2048000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.449513 +INFO: TimeDuration, Event = Add_end, Time = 0.000099 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 70 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.449525 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.449593 +INFO: TimeDuration, Event = Relu_end, Time = 0.000069 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 71 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352815.449607 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 1, w = 1 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 128000 +DEBUG: Attempting to Allocate = 128000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 64, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352815.449712 +INFO: TimeDuration, Event = Pool_end, Time = 0.000106 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 72 +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352815.449728 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 64 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352815.449768 +INFO: TimeDuration, Event = Mul_end, Time = 0.000040 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.449780 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.449800 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 74 +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352815.449812 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352815.449854 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000042 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 90.199997 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 131.841036, current iteration energy = 0.000000 + +RUNNING BATCH = 1 +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 1 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.524017 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.524258 +INFO: TimeDuration, Event = Add_end, Time = 0.000241 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.524402 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.524627 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 4 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.531380 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.531624 +INFO: TimeDuration, Event = Add_end, Time = 0.000244 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.531638 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.531863 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 7 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.544822 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.545060 +INFO: TimeDuration, Event = Add_end, Time = 0.000238 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 9 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.545076 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 8192000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.545418 +INFO: TimeDuration, Event = Add_end, Time = 0.000342 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 10 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.545432 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.545657 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 11 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.557934 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.558172 +INFO: TimeDuration, Event = Add_end, Time = 0.000239 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.558185 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.558409 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 14 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.571023 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.571261 +INFO: TimeDuration, Event = Add_end, Time = 0.000239 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 16 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.571275 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 8192000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.571615 +INFO: TimeDuration, Event = Add_end, Time = 0.000340 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 17 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.571627 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.571850 +INFO: TimeDuration, Event = Relu_end, Time = 0.000223 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 18 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.584172 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.584410 +INFO: TimeDuration, Event = Add_end, Time = 0.000238 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.584437 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.584663 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 21 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.597311 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.597550 +INFO: TimeDuration, Event = Add_end, Time = 0.000239 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 23 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.597564 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 8192000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.597905 +INFO: TimeDuration, Event = Add_end, Time = 0.000341 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 24 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.597916 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.598140 +INFO: TimeDuration, Event = Relu_end, Time = 0.000224 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 25 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.605851 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.605987 +INFO: TimeDuration, Event = Add_end, Time = 0.000136 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.605998 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.606119 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 30 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.608448 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.608582 +INFO: TimeDuration, Event = Add_end, Time = 0.000135 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 28 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.610720 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.610860 +INFO: TimeDuration, Event = Add_end, Time = 0.000140 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 32 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.610874 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 4096000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.611074 +INFO: TimeDuration, Event = Add_end, Time = 0.000200 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 33 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.611086 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.611205 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 34 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.616428 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.616567 +INFO: TimeDuration, Event = Add_end, Time = 0.000139 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.616580 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.616702 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 37 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.622088 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.622225 +INFO: TimeDuration, Event = Add_end, Time = 0.000137 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 39 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.622237 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 4096000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.622436 +INFO: TimeDuration, Event = Add_end, Time = 0.000198 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 40 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.622448 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.622567 +INFO: TimeDuration, Event = Relu_end, Time = 0.000119 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 41 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.627748 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.627886 +INFO: TimeDuration, Event = Add_end, Time = 0.000137 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.627897 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.628019 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 44 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.633434 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.633569 +INFO: TimeDuration, Event = Add_end, Time = 0.000135 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 46 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.633582 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 4096000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.633781 +INFO: TimeDuration, Event = Add_end, Time = 0.000198 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 47 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.633793 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.633913 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 48 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.637933 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.638014 +INFO: TimeDuration, Event = Add_end, Time = 0.000081 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.638026 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.638095 +INFO: TimeDuration, Event = Relu_end, Time = 0.000070 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 53 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.639387 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.639468 +INFO: TimeDuration, Event = Add_end, Time = 0.000081 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 51 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.640817 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.640899 +INFO: TimeDuration, Event = Add_end, Time = 0.000082 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 55 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.640913 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 2048000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.641014 +INFO: TimeDuration, Event = Add_end, Time = 0.000101 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 56 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.641026 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.641094 +INFO: TimeDuration, Event = Relu_end, Time = 0.000069 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 57 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.643947 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.644031 +INFO: TimeDuration, Event = Add_end, Time = 0.000083 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.644043 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.644113 +INFO: TimeDuration, Event = Relu_end, Time = 0.000070 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 60 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.647079 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.647162 +INFO: TimeDuration, Event = Add_end, Time = 0.000083 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 62 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.647176 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 2048000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.647275 +INFO: TimeDuration, Event = Add_end, Time = 0.000099 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 63 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.647288 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.647357 +INFO: TimeDuration, Event = Relu_end, Time = 0.000069 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 64 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.650207 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.650289 +INFO: TimeDuration, Event = Add_end, Time = 0.000083 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.650308 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.650377 +INFO: TimeDuration, Event = Relu_end, Time = 0.000069 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 67 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.653346 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.653428 +INFO: TimeDuration, Event = Add_end, Time = 0.000082 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 69 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.653441 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 2048000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.653540 +INFO: TimeDuration, Event = Add_end, Time = 0.000099 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 70 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.653552 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.653620 +INFO: TimeDuration, Event = Relu_end, Time = 0.000068 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 71 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352815.653634 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 1, w = 1 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 128000 +DEBUG: Attempting to Allocate = 128000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 64, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352815.653739 +INFO: TimeDuration, Event = Pool_end, Time = 0.000106 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 72 +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352815.653754 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 64 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352815.653794 +INFO: TimeDuration, Event = Mul_end, Time = 0.000040 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.653807 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.653826 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 74 +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352815.653838 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352815.653880 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000042 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 88.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 131.652881, current iteration energy = 0.000000 + +RUNNING BATCH = 2 +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 1 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.732998 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.733280 +INFO: TimeDuration, Event = Add_end, Time = 0.000282 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.733389 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.733637 +INFO: TimeDuration, Event = Relu_end, Time = 0.000248 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 4 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.741266 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.741511 +INFO: TimeDuration, Event = Add_end, Time = 0.000246 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.741531 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.741760 +INFO: TimeDuration, Event = Relu_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 7 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.757149 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.757433 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 9 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.757482 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 8192000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.757834 +INFO: TimeDuration, Event = Add_end, Time = 0.000353 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 10 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.757866 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.758107 +INFO: TimeDuration, Event = Relu_end, Time = 0.000241 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 11 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.770838 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.771090 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.771106 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.771341 +INFO: TimeDuration, Event = Relu_end, Time = 0.000234 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 14 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.784111 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.784358 +INFO: TimeDuration, Event = Add_end, Time = 0.000247 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 16 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.784562 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 8192000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.784905 +INFO: TimeDuration, Event = Add_end, Time = 0.000342 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 17 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.784921 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.785150 +INFO: TimeDuration, Event = Relu_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 18 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.797825 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.798078 +INFO: TimeDuration, Event = Add_end, Time = 0.000254 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.798094 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.798325 +INFO: TimeDuration, Event = Relu_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 21 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.811171 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.811420 +INFO: TimeDuration, Event = Add_end, Time = 0.000249 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 23 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.811440 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 8192000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.811786 +INFO: TimeDuration, Event = Add_end, Time = 0.000347 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 24 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.811803 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.812033 +INFO: TimeDuration, Event = Relu_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 25 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.819966 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.820128 +INFO: TimeDuration, Event = Add_end, Time = 0.000162 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.820145 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.820280 +INFO: TimeDuration, Event = Relu_end, Time = 0.000135 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 30 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.822676 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.822817 +INFO: TimeDuration, Event = Add_end, Time = 0.000140 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 28 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.825196 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.825339 +INFO: TimeDuration, Event = Add_end, Time = 0.000142 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 32 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.825368 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 4096000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.825582 +INFO: TimeDuration, Event = Add_end, Time = 0.000214 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 33 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.825599 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.825724 +INFO: TimeDuration, Event = Relu_end, Time = 0.000125 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 34 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.831039 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.831182 +INFO: TimeDuration, Event = Add_end, Time = 0.000143 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.831198 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.831324 +INFO: TimeDuration, Event = Relu_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 37 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.836881 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.837024 +INFO: TimeDuration, Event = Add_end, Time = 0.000143 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 39 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.837042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 4096000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.837246 +INFO: TimeDuration, Event = Add_end, Time = 0.000204 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 40 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.837263 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.837388 +INFO: TimeDuration, Event = Relu_end, Time = 0.000125 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 41 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.842716 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.842859 +INFO: TimeDuration, Event = Add_end, Time = 0.000143 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.842875 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.843000 +INFO: TimeDuration, Event = Relu_end, Time = 0.000125 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 44 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.848573 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.848714 +INFO: TimeDuration, Event = Add_end, Time = 0.000141 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 46 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.848732 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 4096000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.848936 +INFO: TimeDuration, Event = Add_end, Time = 0.000204 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 47 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.848952 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.849075 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 48 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.853193 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.853278 +INFO: TimeDuration, Event = Add_end, Time = 0.000086 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.853294 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.853367 +INFO: TimeDuration, Event = Relu_end, Time = 0.000073 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 53 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.854795 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.854880 +INFO: TimeDuration, Event = Add_end, Time = 0.000085 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 51 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.856679 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.856767 +INFO: TimeDuration, Event = Add_end, Time = 0.000088 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 55 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.856785 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 2048000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.856889 +INFO: TimeDuration, Event = Add_end, Time = 0.000104 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 56 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.856905 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.856977 +INFO: TimeDuration, Event = Relu_end, Time = 0.000072 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 57 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.859990 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.860078 +INFO: TimeDuration, Event = Add_end, Time = 0.000088 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.860093 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.860168 +INFO: TimeDuration, Event = Relu_end, Time = 0.000075 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 60 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.863287 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.863377 +INFO: TimeDuration, Event = Add_end, Time = 0.000089 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 62 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.863394 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 2048000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.863498 +INFO: TimeDuration, Event = Add_end, Time = 0.000103 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 63 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.863515 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.863588 +INFO: TimeDuration, Event = Relu_end, Time = 0.000073 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 64 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.866566 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.866654 +INFO: TimeDuration, Event = Add_end, Time = 0.000088 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.866699 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.866774 +INFO: TimeDuration, Event = Relu_end, Time = 0.000075 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 67 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.869789 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.869873 +INFO: TimeDuration, Event = Add_end, Time = 0.000084 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 69 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.869888 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 2048000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.869990 +INFO: TimeDuration, Event = Add_end, Time = 0.000102 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 70 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.870003 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.870072 +INFO: TimeDuration, Event = Relu_end, Time = 0.000069 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 71 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352815.870085 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 1, w = 1 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 128000 +DEBUG: Attempting to Allocate = 128000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 64, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352815.870234 +INFO: TimeDuration, Event = Pool_end, Time = 0.000149 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 72 +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352815.870251 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 64 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352815.870304 +INFO: TimeDuration, Event = Mul_end, Time = 0.000053 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.870318 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.870347 +INFO: TimeDuration, Event = Add_end, Time = 0.000029 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 74 +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352815.870361 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352815.870435 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000074 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 89.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.508902, current iteration energy = 0.000000 + +RUNNING BATCH = 3 +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 1 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.955174 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.955447 +INFO: TimeDuration, Event = Add_end, Time = 0.000272 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.955472 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.955711 +INFO: TimeDuration, Event = Relu_end, Time = 0.000239 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 4 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.962854 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.963097 +INFO: TimeDuration, Event = Add_end, Time = 0.000243 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.963115 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.963343 +INFO: TimeDuration, Event = Relu_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 7 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.976336 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.976580 +INFO: TimeDuration, Event = Add_end, Time = 0.000244 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 9 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.976597 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 8192000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.976941 +INFO: TimeDuration, Event = Add_end, Time = 0.000345 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 10 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.976959 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.977186 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 11 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352815.989471 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352815.989713 +INFO: TimeDuration, Event = Add_end, Time = 0.000243 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352815.989726 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352815.989950 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 14 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.002597 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.002844 +INFO: TimeDuration, Event = Add_end, Time = 0.000246 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 16 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.002859 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 8192000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.003202 +INFO: TimeDuration, Event = Add_end, Time = 0.000343 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 17 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.003215 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.003440 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 18 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.015772 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.016014 +INFO: TimeDuration, Event = Add_end, Time = 0.000242 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.016026 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.016253 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 21 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.028939 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.029184 +INFO: TimeDuration, Event = Add_end, Time = 0.000245 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 23 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.029237 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 8192000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.029578 +INFO: TimeDuration, Event = Add_end, Time = 0.000341 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 24 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.029592 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.029819 +INFO: TimeDuration, Event = Relu_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 25 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.037504 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.037647 +INFO: TimeDuration, Event = Add_end, Time = 0.000143 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.037660 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.037785 +INFO: TimeDuration, Event = Relu_end, Time = 0.000125 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 30 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.040118 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.040253 +INFO: TimeDuration, Event = Add_end, Time = 0.000135 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 28 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.042439 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.042578 +INFO: TimeDuration, Event = Add_end, Time = 0.000138 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 32 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.042592 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 4096000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.042794 +INFO: TimeDuration, Event = Add_end, Time = 0.000201 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 33 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.042806 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.042927 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 34 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.048126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.048265 +INFO: TimeDuration, Event = Add_end, Time = 0.000139 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.048276 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.048399 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 37 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.053835 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.053974 +INFO: TimeDuration, Event = Add_end, Time = 0.000139 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 39 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.053990 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 4096000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.054190 +INFO: TimeDuration, Event = Add_end, Time = 0.000200 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 40 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.054202 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.054324 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 41 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.059535 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.059674 +INFO: TimeDuration, Event = Add_end, Time = 0.000138 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.059686 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.059809 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 44 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.065242 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.065379 +INFO: TimeDuration, Event = Add_end, Time = 0.000137 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 46 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.065394 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 4096000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.065595 +INFO: TimeDuration, Event = Add_end, Time = 0.000201 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 47 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.065607 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.065729 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 48 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.069763 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.069846 +INFO: TimeDuration, Event = Add_end, Time = 0.000083 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.069859 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.069929 +INFO: TimeDuration, Event = Relu_end, Time = 0.000071 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 53 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.071272 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.071351 +INFO: TimeDuration, Event = Add_end, Time = 0.000079 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 51 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.072733 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.072817 +INFO: TimeDuration, Event = Add_end, Time = 0.000084 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 55 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.072831 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 2048000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.072933 +INFO: TimeDuration, Event = Add_end, Time = 0.000102 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 56 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.072945 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.073016 +INFO: TimeDuration, Event = Relu_end, Time = 0.000071 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 57 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.075892 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.075976 +INFO: TimeDuration, Event = Add_end, Time = 0.000084 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.076012 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.076083 +INFO: TimeDuration, Event = Relu_end, Time = 0.000070 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 60 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.079038 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.079123 +INFO: TimeDuration, Event = Add_end, Time = 0.000084 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 62 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.079137 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 2048000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.079238 +INFO: TimeDuration, Event = Add_end, Time = 0.000101 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 63 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.079252 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.079323 +INFO: TimeDuration, Event = Relu_end, Time = 0.000071 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 64 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.082177 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.082262 +INFO: TimeDuration, Event = Add_end, Time = 0.000084 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.082296 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.082368 +INFO: TimeDuration, Event = Relu_end, Time = 0.000072 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 67 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.085394 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.085478 +INFO: TimeDuration, Event = Add_end, Time = 0.000084 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 69 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.085492 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 2048000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.085593 +INFO: TimeDuration, Event = Add_end, Time = 0.000101 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 70 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.085605 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.085675 +INFO: TimeDuration, Event = Relu_end, Time = 0.000070 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 71 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352816.085689 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 1, w = 1 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 128000 +DEBUG: Attempting to Allocate = 128000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 64, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352816.085803 +INFO: TimeDuration, Event = Pool_end, Time = 0.000113 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 72 +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352816.085818 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 64 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352816.085875 +INFO: TimeDuration, Event = Mul_end, Time = 0.000056 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.085890 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.085912 +INFO: TimeDuration, Event = Add_end, Time = 0.000023 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 74 +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352816.085926 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352816.085985 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000059 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 89.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 132.903743, current iteration energy = 0.000000 + +RUNNING BATCH = 4 +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 1 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.163189 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.163466 +INFO: TimeDuration, Event = Add_end, Time = 0.000277 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.163680 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.163929 +INFO: TimeDuration, Event = Relu_end, Time = 0.000250 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 4 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.171502 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.171756 +INFO: TimeDuration, Event = Add_end, Time = 0.000254 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.171783 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.172024 +INFO: TimeDuration, Event = Relu_end, Time = 0.000240 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 7 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.185665 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.185953 +INFO: TimeDuration, Event = Add_end, Time = 0.000288 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 9 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.185976 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 8192000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.186332 +INFO: TimeDuration, Event = Add_end, Time = 0.000355 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 10 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.186355 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.186614 +INFO: TimeDuration, Event = Relu_end, Time = 0.000259 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 11 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.199695 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.199942 +INFO: TimeDuration, Event = Add_end, Time = 0.000248 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.199960 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.200196 +INFO: TimeDuration, Event = Relu_end, Time = 0.000236 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 14 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.213151 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.213401 +INFO: TimeDuration, Event = Add_end, Time = 0.000249 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 16 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.213423 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 8192000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.213771 +INFO: TimeDuration, Event = Add_end, Time = 0.000348 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 17 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.213790 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.214023 +INFO: TimeDuration, Event = Relu_end, Time = 0.000233 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 18 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.227186 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.227441 +INFO: TimeDuration, Event = Add_end, Time = 0.000254 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.227459 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.227693 +INFO: TimeDuration, Event = Relu_end, Time = 0.000234 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 21 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.240713 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.240963 +INFO: TimeDuration, Event = Add_end, Time = 0.000249 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 23 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.240985 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 8192000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.241333 +INFO: TimeDuration, Event = Add_end, Time = 0.000348 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 24 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.241353 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.241585 +INFO: TimeDuration, Event = Relu_end, Time = 0.000233 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 25 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.249710 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.249873 +INFO: TimeDuration, Event = Add_end, Time = 0.000164 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.249892 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.250022 +INFO: TimeDuration, Event = Relu_end, Time = 0.000130 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 30 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.252580 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.252713 +INFO: TimeDuration, Event = Add_end, Time = 0.000133 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 28 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.255258 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.255401 +INFO: TimeDuration, Event = Add_end, Time = 0.000143 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 32 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.255422 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 4096000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.255633 +INFO: TimeDuration, Event = Add_end, Time = 0.000211 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 33 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.255653 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.255782 +INFO: TimeDuration, Event = Relu_end, Time = 0.000129 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 34 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.261286 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.261431 +INFO: TimeDuration, Event = Add_end, Time = 0.000145 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.261448 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.261579 +INFO: TimeDuration, Event = Relu_end, Time = 0.000130 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 37 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.267300 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.267444 +INFO: TimeDuration, Event = Add_end, Time = 0.000145 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 39 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.267465 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 4096000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.267673 +INFO: TimeDuration, Event = Add_end, Time = 0.000207 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 40 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.267691 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.267820 +INFO: TimeDuration, Event = Relu_end, Time = 0.000129 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 41 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.273337 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.273481 +INFO: TimeDuration, Event = Add_end, Time = 0.000144 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.273499 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.273628 +INFO: TimeDuration, Event = Relu_end, Time = 0.000129 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 44 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.279356 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.279498 +INFO: TimeDuration, Event = Add_end, Time = 0.000143 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 46 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.279519 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 4096000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.279725 +INFO: TimeDuration, Event = Add_end, Time = 0.000206 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 47 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.279744 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.279873 +INFO: TimeDuration, Event = Relu_end, Time = 0.000129 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 48 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.284052 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.284142 +INFO: TimeDuration, Event = Add_end, Time = 0.000090 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.284161 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.284239 +INFO: TimeDuration, Event = Relu_end, Time = 0.000078 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 53 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.285640 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.285727 +INFO: TimeDuration, Event = Add_end, Time = 0.000086 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 51 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.287339 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.287428 +INFO: TimeDuration, Event = Add_end, Time = 0.000089 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 55 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.287449 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 2048000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.287555 +INFO: TimeDuration, Event = Add_end, Time = 0.000106 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 56 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.287575 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.287650 +INFO: TimeDuration, Event = Relu_end, Time = 0.000075 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 57 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.290734 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.290824 +INFO: TimeDuration, Event = Add_end, Time = 0.000090 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.290841 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.290919 +INFO: TimeDuration, Event = Relu_end, Time = 0.000077 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 60 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.294109 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.294199 +INFO: TimeDuration, Event = Add_end, Time = 0.000090 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 62 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.294220 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 2048000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.294325 +INFO: TimeDuration, Event = Add_end, Time = 0.000105 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 63 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.294343 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.294418 +INFO: TimeDuration, Event = Relu_end, Time = 0.000075 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 64 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.297489 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.297579 +INFO: TimeDuration, Event = Add_end, Time = 0.000090 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.297609 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.297687 +INFO: TimeDuration, Event = Relu_end, Time = 0.000078 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 67 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.300882 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.300975 +INFO: TimeDuration, Event = Add_end, Time = 0.000093 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 69 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.300996 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 2048000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.301104 +INFO: TimeDuration, Event = Add_end, Time = 0.000108 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 70 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.301124 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.301198 +INFO: TimeDuration, Event = Relu_end, Time = 0.000075 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 71 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352816.301218 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 1, w = 1 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 128000 +DEBUG: Attempting to Allocate = 128000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 64, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352816.301361 +INFO: TimeDuration, Event = Pool_end, Time = 0.000143 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 72 +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352816.301383 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 64 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352816.301454 +INFO: TimeDuration, Event = Mul_end, Time = 0.000070 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.301473 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.301518 +INFO: TimeDuration, Event = Add_end, Time = 0.000044 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 74 +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352816.301538 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352816.301638 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000101 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 89.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 140.905377, current iteration energy = 0.000000 + +RUNNING BATCH = 5 +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 1 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.376458 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.376735 +INFO: TimeDuration, Event = Add_end, Time = 0.000277 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.376767 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.377015 +INFO: TimeDuration, Event = Relu_end, Time = 0.000247 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 4 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.384903 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.385163 +INFO: TimeDuration, Event = Add_end, Time = 0.000260 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.385193 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.385437 +INFO: TimeDuration, Event = Relu_end, Time = 0.000243 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 7 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.398982 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.399231 +INFO: TimeDuration, Event = Add_end, Time = 0.000249 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 9 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.399254 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 8192000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.399603 +INFO: TimeDuration, Event = Add_end, Time = 0.000349 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 10 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.399625 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.399857 +INFO: TimeDuration, Event = Relu_end, Time = 0.000232 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 11 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.412980 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.413231 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.413248 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.413486 +INFO: TimeDuration, Event = Relu_end, Time = 0.000238 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 14 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.426465 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.426721 +INFO: TimeDuration, Event = Add_end, Time = 0.000255 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 16 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.426745 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 8192000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.427090 +INFO: TimeDuration, Event = Add_end, Time = 0.000345 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 17 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.427109 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.427343 +INFO: TimeDuration, Event = Relu_end, Time = 0.000234 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 18 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.440505 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.440754 +INFO: TimeDuration, Event = Add_end, Time = 0.000249 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.440772 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.441008 +INFO: TimeDuration, Event = Relu_end, Time = 0.000236 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 21 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.454019 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.454267 +INFO: TimeDuration, Event = Add_end, Time = 0.000248 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 23 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.454289 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 8192000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.454635 +INFO: TimeDuration, Event = Add_end, Time = 0.000346 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 24 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.454654 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.454887 +INFO: TimeDuration, Event = Relu_end, Time = 0.000234 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 25 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.462969 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.463118 +INFO: TimeDuration, Event = Add_end, Time = 0.000149 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.463136 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.463266 +INFO: TimeDuration, Event = Relu_end, Time = 0.000130 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 30 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.465735 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.465878 +INFO: TimeDuration, Event = Add_end, Time = 0.000143 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 28 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.468596 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.468738 +INFO: TimeDuration, Event = Add_end, Time = 0.000142 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 32 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.468759 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 4096000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.468969 +INFO: TimeDuration, Event = Add_end, Time = 0.000209 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 33 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.468987 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.469117 +INFO: TimeDuration, Event = Relu_end, Time = 0.000129 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 34 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.474584 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.474729 +INFO: TimeDuration, Event = Add_end, Time = 0.000145 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.474748 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.474880 +INFO: TimeDuration, Event = Relu_end, Time = 0.000133 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 37 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.480639 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.480799 +INFO: TimeDuration, Event = Add_end, Time = 0.000160 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 39 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.480835 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 4096000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.481037 +INFO: TimeDuration, Event = Add_end, Time = 0.000202 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 40 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.481056 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.481187 +INFO: TimeDuration, Event = Relu_end, Time = 0.000131 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 41 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.486668 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.486812 +INFO: TimeDuration, Event = Add_end, Time = 0.000144 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.486830 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.486960 +INFO: TimeDuration, Event = Relu_end, Time = 0.000130 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 44 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.492682 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.492824 +INFO: TimeDuration, Event = Add_end, Time = 0.000142 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 46 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.492845 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 4096000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.493052 +INFO: TimeDuration, Event = Add_end, Time = 0.000206 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 47 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.493070 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.493202 +INFO: TimeDuration, Event = Relu_end, Time = 0.000132 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 48 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.497369 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.497458 +INFO: TimeDuration, Event = Add_end, Time = 0.000090 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.497478 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.497557 +INFO: TimeDuration, Event = Relu_end, Time = 0.000080 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 53 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.498950 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.499038 +INFO: TimeDuration, Event = Add_end, Time = 0.000088 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 51 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.500675 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.500765 +INFO: TimeDuration, Event = Add_end, Time = 0.000090 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 55 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.500786 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 2048000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.500894 +INFO: TimeDuration, Event = Add_end, Time = 0.000108 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 56 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.500913 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.500990 +INFO: TimeDuration, Event = Relu_end, Time = 0.000076 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 57 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.504032 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.504121 +INFO: TimeDuration, Event = Add_end, Time = 0.000089 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.504139 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.504216 +INFO: TimeDuration, Event = Relu_end, Time = 0.000077 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 60 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.507411 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.507502 +INFO: TimeDuration, Event = Add_end, Time = 0.000091 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 62 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.507524 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 2048000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.507630 +INFO: TimeDuration, Event = Add_end, Time = 0.000105 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 63 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.507649 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.507725 +INFO: TimeDuration, Event = Relu_end, Time = 0.000076 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 64 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.510780 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.510906 +INFO: TimeDuration, Event = Add_end, Time = 0.000126 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.510936 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.511014 +INFO: TimeDuration, Event = Relu_end, Time = 0.000078 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 67 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.514185 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.514274 +INFO: TimeDuration, Event = Add_end, Time = 0.000089 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 69 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.514295 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 2048000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.514400 +INFO: TimeDuration, Event = Add_end, Time = 0.000105 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 70 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.514418 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.514495 +INFO: TimeDuration, Event = Relu_end, Time = 0.000077 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 71 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352816.514516 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 1, w = 1 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 128000 +DEBUG: Attempting to Allocate = 128000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 64, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352816.514669 +INFO: TimeDuration, Event = Pool_end, Time = 0.000153 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 72 +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352816.514692 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 64 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352816.514762 +INFO: TimeDuration, Event = Mul_end, Time = 0.000071 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.514783 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.514814 +INFO: TimeDuration, Event = Add_end, Time = 0.000031 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 74 +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352816.514833 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352816.514917 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000084 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 89.199997 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 140.701297, current iteration energy = 0.000000 + +RUNNING BATCH = 6 +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 1 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.593154 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.593416 +INFO: TimeDuration, Event = Add_end, Time = 0.000262 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.593589 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.593823 +INFO: TimeDuration, Event = Relu_end, Time = 0.000234 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 4 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.600975 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.601224 +INFO: TimeDuration, Event = Add_end, Time = 0.000249 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.601244 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.601477 +INFO: TimeDuration, Event = Relu_end, Time = 0.000233 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 7 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.614707 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.614954 +INFO: TimeDuration, Event = Add_end, Time = 0.000247 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 9 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.614973 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 8192000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.615319 +INFO: TimeDuration, Event = Add_end, Time = 0.000346 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 10 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.615340 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.615568 +INFO: TimeDuration, Event = Relu_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 11 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.628388 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.628637 +INFO: TimeDuration, Event = Add_end, Time = 0.000249 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.628653 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.628884 +INFO: TimeDuration, Event = Relu_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 14 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.641713 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.641964 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 16 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.641984 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 8192000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.642329 +INFO: TimeDuration, Event = Add_end, Time = 0.000345 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 17 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.642345 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.642574 +INFO: TimeDuration, Event = Relu_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 18 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.655432 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.655698 +INFO: TimeDuration, Event = Add_end, Time = 0.000266 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.655715 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.655948 +INFO: TimeDuration, Event = Relu_end, Time = 0.000233 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 21 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.668737 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.668983 +INFO: TimeDuration, Event = Add_end, Time = 0.000247 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 23 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.669004 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 8192000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.669347 +INFO: TimeDuration, Event = Add_end, Time = 0.000343 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 24 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.669363 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.669590 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 25 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.677572 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.677719 +INFO: TimeDuration, Event = Add_end, Time = 0.000147 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.677735 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.677862 +INFO: TimeDuration, Event = Relu_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 30 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.680283 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.680423 +INFO: TimeDuration, Event = Add_end, Time = 0.000141 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 28 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.682933 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.683075 +INFO: TimeDuration, Event = Add_end, Time = 0.000142 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 32 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.683095 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 4096000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.683304 +INFO: TimeDuration, Event = Add_end, Time = 0.000209 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 33 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.683323 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.683450 +INFO: TimeDuration, Event = Relu_end, Time = 0.000127 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 34 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.688777 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.688920 +INFO: TimeDuration, Event = Add_end, Time = 0.000143 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.688936 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.689064 +INFO: TimeDuration, Event = Relu_end, Time = 0.000128 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 37 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.694617 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.694759 +INFO: TimeDuration, Event = Add_end, Time = 0.000142 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 39 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.694779 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 4096000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.694982 +INFO: TimeDuration, Event = Add_end, Time = 0.000203 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 40 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.694998 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.695124 +INFO: TimeDuration, Event = Relu_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 41 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.700466 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.700609 +INFO: TimeDuration, Event = Add_end, Time = 0.000144 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.700626 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.700752 +INFO: TimeDuration, Event = Relu_end, Time = 0.000127 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 44 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.706319 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.706461 +INFO: TimeDuration, Event = Add_end, Time = 0.000143 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 46 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.706481 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 4096000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.706683 +INFO: TimeDuration, Event = Add_end, Time = 0.000203 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 47 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.706700 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.706827 +INFO: TimeDuration, Event = Relu_end, Time = 0.000127 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 48 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.710925 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.711013 +INFO: TimeDuration, Event = Add_end, Time = 0.000087 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.711029 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.711104 +INFO: TimeDuration, Event = Relu_end, Time = 0.000075 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 53 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.712487 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.712574 +INFO: TimeDuration, Event = Add_end, Time = 0.000087 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 51 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.714122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.714210 +INFO: TimeDuration, Event = Add_end, Time = 0.000089 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 55 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.714268 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 2048000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.714372 +INFO: TimeDuration, Event = Add_end, Time = 0.000103 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 56 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.714387 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.714462 +INFO: TimeDuration, Event = Relu_end, Time = 0.000074 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 57 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.717422 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.717511 +INFO: TimeDuration, Event = Add_end, Time = 0.000089 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.717527 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.717601 +INFO: TimeDuration, Event = Relu_end, Time = 0.000074 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 60 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.720704 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.720792 +INFO: TimeDuration, Event = Add_end, Time = 0.000088 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 62 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.720810 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 2048000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.720913 +INFO: TimeDuration, Event = Add_end, Time = 0.000103 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 63 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.720930 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.721004 +INFO: TimeDuration, Event = Relu_end, Time = 0.000074 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 64 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.723986 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.724076 +INFO: TimeDuration, Event = Add_end, Time = 0.000090 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.724103 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.724179 +INFO: TimeDuration, Event = Relu_end, Time = 0.000076 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 67 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.727262 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.727350 +INFO: TimeDuration, Event = Add_end, Time = 0.000088 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 69 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.727369 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 2048000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.727472 +INFO: TimeDuration, Event = Add_end, Time = 0.000103 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 70 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.727488 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.727562 +INFO: TimeDuration, Event = Relu_end, Time = 0.000074 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 71 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352816.727580 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 1, w = 1 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 128000 +DEBUG: Attempting to Allocate = 128000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 64, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352816.727726 +INFO: TimeDuration, Event = Pool_end, Time = 0.000146 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 72 +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352816.727747 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 64 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352816.727811 +INFO: TimeDuration, Event = Mul_end, Time = 0.000064 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.727833 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.727857 +INFO: TimeDuration, Event = Add_end, Time = 0.000024 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 74 +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352816.727875 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352816.727955 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000080 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 90.400002 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 136.983737, current iteration energy = 0.000000 + +RUNNING BATCH = 7 +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 1 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.805762 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.806025 +INFO: TimeDuration, Event = Add_end, Time = 0.000263 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.806118 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.806354 +INFO: TimeDuration, Event = Relu_end, Time = 0.000235 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 4 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.813730 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.813985 +INFO: TimeDuration, Event = Add_end, Time = 0.000255 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.814013 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.814250 +INFO: TimeDuration, Event = Relu_end, Time = 0.000238 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 7 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.827455 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.827701 +INFO: TimeDuration, Event = Add_end, Time = 0.000246 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 9 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.827722 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 8192000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.828068 +INFO: TimeDuration, Event = Add_end, Time = 0.000347 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 10 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.828091 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.828321 +INFO: TimeDuration, Event = Relu_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 11 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.841160 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.841407 +INFO: TimeDuration, Event = Add_end, Time = 0.000247 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.841426 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.841655 +INFO: TimeDuration, Event = Relu_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 14 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.854430 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.854674 +INFO: TimeDuration, Event = Add_end, Time = 0.000244 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 16 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.854694 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 8192000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.855037 +INFO: TimeDuration, Event = Add_end, Time = 0.000344 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 17 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.855057 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.855285 +INFO: TimeDuration, Event = Relu_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 18 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.868159 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.868407 +INFO: TimeDuration, Event = Add_end, Time = 0.000248 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.868422 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.868652 +INFO: TimeDuration, Event = Relu_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 21 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.881487 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.881733 +INFO: TimeDuration, Event = Add_end, Time = 0.000245 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 23 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.881753 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 8192000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.882101 +INFO: TimeDuration, Event = Add_end, Time = 0.000347 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 24 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.882120 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.882350 +INFO: TimeDuration, Event = Relu_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 25 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.890299 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.890444 +INFO: TimeDuration, Event = Add_end, Time = 0.000144 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.890463 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.890590 +INFO: TimeDuration, Event = Relu_end, Time = 0.000127 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 30 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.893031 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.893173 +INFO: TimeDuration, Event = Add_end, Time = 0.000142 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 28 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.895589 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.895733 +INFO: TimeDuration, Event = Add_end, Time = 0.000144 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 32 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.895753 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 4096000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.895962 +INFO: TimeDuration, Event = Add_end, Time = 0.000209 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 33 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.895983 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.896108 +INFO: TimeDuration, Event = Relu_end, Time = 0.000125 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 34 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.901458 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.901603 +INFO: TimeDuration, Event = Add_end, Time = 0.000145 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.901619 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.901747 +INFO: TimeDuration, Event = Relu_end, Time = 0.000128 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 37 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.907319 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.907463 +INFO: TimeDuration, Event = Add_end, Time = 0.000144 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 39 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.907482 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 4096000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.907687 +INFO: TimeDuration, Event = Add_end, Time = 0.000205 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 40 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.907704 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.907830 +INFO: TimeDuration, Event = Relu_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 41 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.913182 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.913327 +INFO: TimeDuration, Event = Add_end, Time = 0.000145 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.913343 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.913471 +INFO: TimeDuration, Event = Relu_end, Time = 0.000128 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 44 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.919096 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.919234 +INFO: TimeDuration, Event = Add_end, Time = 0.000137 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 46 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.919251 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 4096000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.919451 +INFO: TimeDuration, Event = Add_end, Time = 0.000200 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 47 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.919466 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.919588 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 48 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.923611 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.923693 +INFO: TimeDuration, Event = Add_end, Time = 0.000082 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.923706 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.923777 +INFO: TimeDuration, Event = Relu_end, Time = 0.000071 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 53 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.925089 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.925171 +INFO: TimeDuration, Event = Add_end, Time = 0.000082 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 51 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.926555 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.926638 +INFO: TimeDuration, Event = Add_end, Time = 0.000084 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 55 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.926655 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 2048000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.926756 +INFO: TimeDuration, Event = Add_end, Time = 0.000102 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 56 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.926770 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.926839 +INFO: TimeDuration, Event = Relu_end, Time = 0.000069 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 57 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.929716 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.929801 +INFO: TimeDuration, Event = Add_end, Time = 0.000085 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.929836 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.929907 +INFO: TimeDuration, Event = Relu_end, Time = 0.000071 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 60 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.932905 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.932990 +INFO: TimeDuration, Event = Add_end, Time = 0.000085 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 62 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.933019 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 2048000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.933119 +INFO: TimeDuration, Event = Add_end, Time = 0.000100 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 63 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.933133 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.933203 +INFO: TimeDuration, Event = Relu_end, Time = 0.000070 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 64 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.936050 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.936134 +INFO: TimeDuration, Event = Add_end, Time = 0.000084 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.936158 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.936230 +INFO: TimeDuration, Event = Relu_end, Time = 0.000071 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 67 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.939204 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.939289 +INFO: TimeDuration, Event = Add_end, Time = 0.000084 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 69 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.939304 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 2048000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.939407 +INFO: TimeDuration, Event = Add_end, Time = 0.000103 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 70 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352816.939422 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352816.939492 +INFO: TimeDuration, Event = Relu_end, Time = 0.000070 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 71 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352816.939508 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 1, w = 1 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 128000 +DEBUG: Attempting to Allocate = 128000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 64, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352816.939628 +INFO: TimeDuration, Event = Pool_end, Time = 0.000120 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 72 +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352816.939644 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 64 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352816.939691 +INFO: TimeDuration, Event = Mul_end, Time = 0.000046 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352816.939704 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352816.939726 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 74 +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352816.939739 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352816.939788 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000049 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 89.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 135.875292, current iteration energy = 0.000000 + +RUNNING BATCH = 8 +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 1 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352817.018281 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352817.018551 +INFO: TimeDuration, Event = Add_end, Time = 0.000271 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352817.018574 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352817.018815 +INFO: TimeDuration, Event = Relu_end, Time = 0.000241 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 4 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352817.029168 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352817.029419 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352817.029441 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352817.029674 +INFO: TimeDuration, Event = Relu_end, Time = 0.000233 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 7 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352817.043283 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352817.043535 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 9 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352817.043556 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 8192000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352817.043905 +INFO: TimeDuration, Event = Add_end, Time = 0.000349 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 10 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352817.043929 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352817.044161 +INFO: TimeDuration, Event = Relu_end, Time = 0.000233 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 11 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352817.057293 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352817.057545 +INFO: TimeDuration, Event = Add_end, Time = 0.000252 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352817.057562 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352817.057796 +INFO: TimeDuration, Event = Relu_end, Time = 0.000234 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 14 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352817.070776 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352817.071024 +INFO: TimeDuration, Event = Add_end, Time = 0.000249 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 16 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352817.071046 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 8192000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352817.071394 +INFO: TimeDuration, Event = Add_end, Time = 0.000348 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 17 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352817.071413 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352817.071645 +INFO: TimeDuration, Event = Relu_end, Time = 0.000232 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 18 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352817.085157 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352817.085408 +INFO: TimeDuration, Event = Add_end, Time = 0.000251 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352817.085426 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352817.085660 +INFO: TimeDuration, Event = Relu_end, Time = 0.000233 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 21 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352817.098678 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 16 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352817.098929 +INFO: TimeDuration, Event = Add_end, Time = 0.000250 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 23 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352817.098951 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 8192000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352817.099300 +INFO: TimeDuration, Event = Add_end, Time = 0.000349 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 24 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352817.099318 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352817.099552 +INFO: TimeDuration, Event = Relu_end, Time = 0.000234 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 25 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352817.107628 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352817.107776 +INFO: TimeDuration, Event = Add_end, Time = 0.000148 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352817.107793 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352817.107922 +INFO: TimeDuration, Event = Relu_end, Time = 0.000129 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 30 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352817.110387 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352817.110529 +INFO: TimeDuration, Event = Add_end, Time = 0.000142 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 28 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352817.113050 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352817.113194 +INFO: TimeDuration, Event = Add_end, Time = 0.000144 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 32 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352817.113215 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 4096000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352817.113426 +INFO: TimeDuration, Event = Add_end, Time = 0.000211 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 33 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352817.113446 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352817.113615 +INFO: TimeDuration, Event = Relu_end, Time = 0.000169 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 34 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352817.119072 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352817.119216 +INFO: TimeDuration, Event = Add_end, Time = 0.000144 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352817.119234 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352817.119363 +INFO: TimeDuration, Event = Relu_end, Time = 0.000129 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 37 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352817.125086 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352817.125230 +INFO: TimeDuration, Event = Add_end, Time = 0.000144 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 39 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352817.125281 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 4096000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352817.125486 +INFO: TimeDuration, Event = Add_end, Time = 0.000205 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 40 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352817.125505 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352817.125632 +INFO: TimeDuration, Event = Relu_end, Time = 0.000127 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 41 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352817.131114 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352817.131260 +INFO: TimeDuration, Event = Add_end, Time = 0.000146 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352817.131279 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352817.131410 +INFO: TimeDuration, Event = Relu_end, Time = 0.000131 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 44 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352817.137131 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 32 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352817.137280 +INFO: TimeDuration, Event = Add_end, Time = 0.000148 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 46 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352817.137301 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 4096000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352817.137506 +INFO: TimeDuration, Event = Add_end, Time = 0.000205 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 47 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352817.137524 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352817.137653 +INFO: TimeDuration, Event = Relu_end, Time = 0.000128 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 48 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352817.141942 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352817.142028 +INFO: TimeDuration, Event = Add_end, Time = 0.000087 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352817.142046 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352817.142123 +INFO: TimeDuration, Event = Relu_end, Time = 0.000078 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 53 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352817.143514 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352817.143601 +INFO: TimeDuration, Event = Add_end, Time = 0.000087 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 51 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352817.145221 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352817.145310 +INFO: TimeDuration, Event = Add_end, Time = 0.000089 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 55 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352817.145331 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 2048000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352817.145437 +INFO: TimeDuration, Event = Add_end, Time = 0.000106 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 56 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352817.145455 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352817.145530 +INFO: TimeDuration, Event = Relu_end, Time = 0.000075 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 57 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352817.148604 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352817.148694 +INFO: TimeDuration, Event = Add_end, Time = 0.000090 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352817.148712 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352817.148788 +INFO: TimeDuration, Event = Relu_end, Time = 0.000076 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 60 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352817.151967 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352817.152057 +INFO: TimeDuration, Event = Add_end, Time = 0.000090 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 62 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352817.152077 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 2048000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352817.152183 +INFO: TimeDuration, Event = Add_end, Time = 0.000106 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 63 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352817.152201 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352817.152279 +INFO: TimeDuration, Event = Relu_end, Time = 0.000078 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 64 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352817.155333 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352817.155424 +INFO: TimeDuration, Event = Add_end, Time = 0.000091 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352817.155452 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352817.155531 +INFO: TimeDuration, Event = Relu_end, Time = 0.000079 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 67 +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352817.158742 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352817.158831 +INFO: TimeDuration, Event = Add_end, Time = 0.000089 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 69 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352817.158851 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 2048000 +INFO: bias->num_elems = 2048000 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352817.158959 +INFO: TimeDuration, Event = Add_end, Time = 0.000108 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 70 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352817.158977 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352817.159051 +INFO: TimeDuration, Event = Relu_end, Time = 0.000075 +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 71 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352817.159072 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 1, w = 1 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 128000 +DEBUG: Attempting to Allocate = 128000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 64, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352817.159217 +INFO: TimeDuration, Event = Pool_end, Time = 0.000145 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 72 +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352817.159240 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 10, k = 64 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 20000 +DEBUG: Attempting to Allocate = 20000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352817.159298 +INFO: TimeDuration, Event = Mul_end, Time = 0.000058 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352817.159316 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 5000 +INFO: bias->num_elems = 10 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352817.159344 +INFO: TimeDuration, Event = Add_end, Time = 0.000028 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +DEBUG: -- currentTensorID = 74 +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352817.159363 +DEBUG: No data movement required - Data on Device +INFO: Moving 20000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352817.159418 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000055 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 10 +****** Accuracy = 88.199997 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 143.473057, current iteration energy = 0.000000 + +RUNNING BATCH = 9 +DEBUG: **** Freeing Ouput Tensors *** +Exiting profiler +INFO: Writing Runtime Profile Info File... +INFO: Done writing profile. diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/out-run-1 b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/out-run-1 new file mode 100644 index 0000000000000000000000000000000000000000..d5d1415885cd728c3f227597d46bffd4446ea4b3 --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/out-run-1 @@ -0,0 +1 @@ +run_dnn_frequency_exp.sh: line 28: ./resnet18_loop_wrapperapi_linked: No such file or directory diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/predictive/resnet18.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/predictive/resnet18.txt new file mode 100644 index 0000000000000000000000000000000000000000..654cffbf632686dca6310a93ecf56b6521e32039 --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/predictive/resnet18.txt @@ -0,0 +1,2296 @@ +2484.981244 ++++++ +conf1 1 1 89.56 0.0 +1 gpu conv fp32 11 add fp32 1 relu fp32 1 +2 gpu conv fp32 11 add fp32 1 relu fp32 1 +3 gpu conv fp32 11 add fp32 1 +4 gpu add fp32 11 +5 gpu relu fp32 11 +6 gpu conv fp32 11 add fp32 1 relu fp32 1 +7 gpu conv fp32 11 add fp32 1 +8 gpu add fp32 11 +9 gpu relu fp32 11 +10 gpu conv fp32 11 add fp32 1 relu fp32 1 +11 gpu conv fp32 11 add fp32 1 +12 gpu add fp32 11 +13 gpu relu fp32 11 +14 gpu conv fp32 11 add fp32 1 relu fp32 1 +15 gpu conv fp32 11 add fp32 1 +16 gpu conv fp32 11 add fp32 1 +17 gpu add fp32 11 +18 gpu relu fp32 11 +19 gpu conv fp32 11 add fp32 1 relu fp32 1 +20 gpu conv fp32 11 add fp32 1 +21 gpu add fp32 11 +22 gpu relu fp32 11 +23 gpu conv fp32 11 add fp32 1 relu fp32 1 +24 gpu conv fp32 11 add fp32 1 +25 gpu add fp32 11 +26 gpu relu fp32 11 +27 gpu conv fp32 11 add fp32 1 relu fp32 1 +28 gpu conv fp32 11 add fp32 1 +29 gpu conv fp32 11 add fp32 1 +30 gpu add fp32 11 +31 gpu relu fp32 11 +32 gpu conv fp32 11 add fp32 1 relu fp32 1 +33 gpu conv fp32 11 add fp32 1 +34 gpu add fp32 11 +35 gpu relu fp32 11 +36 gpu conv fp32 11 add fp32 1 relu fp32 1 +37 gpu conv fp32 11 add fp32 1 +38 gpu add fp32 11 +39 gpu relu fp32 11 +40 gpu pool_mean fp32 11 +41 gpu mul fp32 11 add fp32 1 +42 gpu softmax fp32 1 +----- ++++++ +conf2 1.767527790869615 1.7962938589450996 88.96 0.6000000000000085 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 162 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 167 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 167 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 +20 gpu conv perf_fp16 155 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv fp16 12 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 160 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 11 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 155 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf3 1.7676486174436143 1.7967155014984917 88.78 0.7800000000000011 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 162 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 167 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 167 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv perf_fp16 160 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 +20 gpu conv perf_fp16 155 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv fp16 12 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 160 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 11 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 155 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf4 1.7674352647250422 1.792910560846682 88.7 0.8599999999999994 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 162 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 167 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 167 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 168 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv fp16 12 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 160 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 11 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 155 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf5 1.8655703338511067 1.8930089896922888 88.53999999999999 1.0200000000000102 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 167 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 158 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv perf_fp16 159 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 165 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 157 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf6 1.9070428103729684 1.9172857853336078 88.38000000000001 1.1799999999999926 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 157 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv samp_fp16 266 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +20 gpu conv perf_fp16 152 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv samp_fp16 261 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 11 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 155 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf7 1.769778590701739 1.7956222622694236 88.24 1.3200000000000074 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv fp16 12 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv samp_fp16 268 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf8 1.841404652091802 1.8677947628418006 88.24 1.3200000000000074 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 168 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 162 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf9 1.8679349428783487 1.8995927920729931 88.22 1.3400000000000034 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 159 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 160 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 168 add fp16 1 relu fp16 1 +20 gpu conv perf_fp16 161 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 11 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf10 1.876937310100899 1.9041581451399825 88.1 1.460000000000008 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 158 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 11 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf11 1.842140004857965 1.8673692956620238 88.06 1.5 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 166 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 167 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 11 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf12 1.9070567138857761 1.9165525910492667 88.02 1.5400000000000063 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 157 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv samp_fp16 266 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 261 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 152 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 11 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 155 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf13 1.9185835698271805 1.9328202469403 87.98 1.5799999999999983 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 157 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv samp_fp16 266 add fp16 1 +16 gpu conv perf_fp16 160 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +20 gpu conv perf_fp16 152 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 152 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 11 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 155 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf14 1.781744853993609 1.8082995958456516 87.92 1.6400000000000006 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 166 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 168 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv perf_fp16 159 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 165 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv samp_fp16 265 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv samp_fp16 268 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf15 1.9185835698271805 1.9328202469403 87.92 1.6400000000000006 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 157 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv samp_fp16 266 add fp16 1 +16 gpu conv perf_fp16 160 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +20 gpu conv perf_fp16 152 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 152 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 12 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 155 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf16 1.875261840315855 1.8986912653657988 87.88 1.6800000000000068 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 159 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 156 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 12 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf17 1.9013559086026153 1.9230901214481015 87.86 1.7000000000000028 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 11 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf18 1.9185835698271805 1.9328202469403 87.83999999999999 1.720000000000013 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 157 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv samp_fp16 266 add fp16 1 +16 gpu conv perf_fp16 160 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +20 gpu conv perf_fp16 152 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 152 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 11 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 155 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf19 1.8770503055325798 1.9007923328014182 87.82 1.740000000000009 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 162 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 158 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 151 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf20 1.8774136276932418 1.90365663123621 87.82 1.740000000000009 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 158 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf21 1.943143041264842 1.9591958561422729 87.82 1.740000000000009 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 152 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf22 1.870789918969847 1.8863625217899933 87.8 1.7600000000000051 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 264 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf23 1.7445941809066292 1.7754934270309912 87.78 1.7800000000000011 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 163 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 162 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 167 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 167 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv perf_fp16 160 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 +20 gpu conv perf_fp16 155 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv fp16 12 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 160 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 11 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv perf_fp16 166 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 155 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf24 1.9065930313550916 1.928938946228637 87.78 1.7800000000000011 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 167 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 11 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf25 1.9021824494907031 1.9237134505552098 87.78 1.7800000000000011 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 154 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf26 1.9017271009017505 1.9211078231701697 87.78 1.7800000000000011 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 162 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf27 1.8187224917656395 1.820406007609536 87.76 1.7999999999999972 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv samp_fp16 264 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf28 1.9070855899343322 1.9285210655709735 87.76 1.7999999999999972 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv samp_fp16 268 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf29 1.9013559086026153 1.9230901214481015 87.74 1.8200000000000074 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 11 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf30 1.8772990284718367 1.9022146647342513 87.72 1.8400000000000034 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 162 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 158 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf31 1.9013559086026153 1.9230901214481015 87.68 1.8799999999999955 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 11 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf32 1.9020502478364545 1.923319572598976 87.66000000000001 1.8999999999999915 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf33 1.7516394053514481 1.7809034526471939 87.62 1.9399999999999977 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 162 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 167 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 167 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv perf_fp16 160 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 +20 gpu conv perf_fp16 155 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv fp16 12 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 160 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 11 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv perf_fp16 166 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 155 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf34 1.7814953252955337 1.8122658147993431 87.62 1.9399999999999977 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 162 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 167 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 167 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv perf_fp16 160 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 +20 gpu conv perf_fp16 155 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv fp16 12 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 160 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 11 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv perf_fp16 166 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 155 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf35 1.887538247557846 1.9103369445911678 87.62 1.9399999999999977 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 158 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 159 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf36 1.9107566783735581 1.9273803227885578 87.6 1.960000000000008 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 157 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf37 1.9013559086026153 1.9230901214481015 87.58 1.980000000000004 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 12 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf38 1.8984089819969947 1.9195632881772446 87.58 1.980000000000004 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf39 1.9020502478364545 1.923319572598976 87.52 2.0400000000000063 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf40 1.9020502478364545 1.923319572598976 87.52 2.0400000000000063 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf41 1.9013559086026153 1.9230901214481015 87.5 2.0600000000000023 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 11 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf42 1.9013559086026153 1.9230901214481015 87.46000000000001 2.0999999999999943 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv fp16 11 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf43 1.9196179152539186 1.9443459719929068 87.44 2.1200000000000045 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 153 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf44 1.9020502478364545 1.923319572598976 87.4 2.1599999999999966 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf45 1.9152817031040366 1.9357432559063958 87.4 2.1599999999999966 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf46 1.915754791147898 1.9373322475753219 87.4 2.1599999999999966 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf47 1.9130551004051772 1.9409232417921056 87.38 2.180000000000007 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv perf_fp16 153 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf48 1.9421147660673033 1.9584555432766413 87.38 2.180000000000007 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf49 1.9052849920081363 1.9300100333661123 87.32 2.240000000000009 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 153 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf50 1.9154322863033566 1.934908329027621 87.3 2.260000000000005 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv perf_fp16 151 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- ++++++ +conf51 1.9079703554020564 1.9287218218306195 86.96000000000001 2.5999999999999943 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 +3 gpu conv fp16 12 add fp16 1 +4 gpu add fp16 12 +5 gpu relu fp16 12 +6 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 161 add fp16 1 +8 gpu add fp16 12 +9 gpu relu fp16 12 +10 gpu conv perf_fp16 154 add fp16 1 relu fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 +12 gpu add fp16 12 +13 gpu relu fp16 12 +14 gpu conv fp16 12 add fp16 1 relu fp16 1 +15 gpu conv fp16 12 add fp16 1 +16 gpu conv fp16 11 add fp16 1 +17 gpu add fp16 12 +18 gpu relu fp16 12 +19 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +20 gpu conv samp_fp16 262 add fp16 1 +21 gpu add fp16 12 +22 gpu relu fp16 12 +23 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 +24 gpu conv perf_fp16 153 add fp16 1 +25 gpu add fp16 12 +26 gpu relu fp16 12 +27 gpu conv fp16 12 add fp16 1 relu fp16 1 +28 gpu conv fp16 12 add fp16 1 +29 gpu conv samp_fp16 261 add fp16 1 +30 gpu add fp16 12 +31 gpu relu fp16 12 +32 gpu conv fp16 12 add fp16 1 relu fp16 1 +33 gpu conv fp16 12 add fp16 1 +34 gpu add fp16 12 +35 gpu relu fp16 12 +36 gpu conv fp16 12 add fp16 1 relu fp16 1 +37 gpu conv perf_fp16 152 add fp16 1 +38 gpu add fp16 12 +39 gpu relu fp16 12 +40 gpu pool_mean fp16 12 +41 gpu mul fp16 12 add fp16 1 +42 gpu softmax fp32 1 +----- diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/src/resnet18_loop.cpp b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/src/resnet18_loop.cpp index 9ca1c627334fa3005c602ee96df63adbca8c969d..556f499a81f5516b2161c3cbc010f54d62c01c99 100644 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/src/resnet18_loop.cpp +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet18/src/resnet18_loop.cpp @@ -13,6 +13,8 @@ void var_0_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(1); + void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); __visc__return(2, r, (size_t) 0); @@ -21,6 +23,8 @@ void var_0_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_1_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(2); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -29,6 +33,8 @@ void var_1_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_2_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(3); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -37,6 +43,8 @@ void var_2_node(void* t1, size_t bytes_t1) { void var_3_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(4); + void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); __visc__return(2, r, (size_t) 0); @@ -45,6 +53,8 @@ void var_3_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_4_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(5); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -53,6 +63,8 @@ void var_4_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_5_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(6); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -61,6 +73,8 @@ void var_5_node(void* t1, size_t bytes_t1) { void var_6_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(7); + void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); __visc__return(2, r, (size_t) 0); @@ -69,6 +83,8 @@ void var_6_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_7_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(8); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -77,6 +93,8 @@ void var_7_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_8_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::CUDNN_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(9); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -85,6 +103,8 @@ void var_8_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_9_node(void* t1, size_t bytes_t1) { __visc__hint(visc::CUDNN_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(10); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -93,6 +113,8 @@ void var_9_node(void* t1, size_t bytes_t1) { void var_10_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(11); + void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); __visc__return(2, r, (size_t) 0); @@ -101,6 +123,8 @@ void var_10_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_11_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(12); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -109,6 +133,8 @@ void var_11_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_12_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(13); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -117,6 +143,8 @@ void var_12_node(void* t1, size_t bytes_t1) { void var_13_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(14); + void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); __visc__return(2, r, (size_t) 0); @@ -125,6 +153,8 @@ void var_13_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_14_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(15); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -133,6 +163,8 @@ void var_14_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_15_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::CUDNN_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(16); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -141,6 +173,8 @@ void var_15_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_16_node(void* t1, size_t bytes_t1) { __visc__hint(visc::CUDNN_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(17); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -149,6 +183,8 @@ void var_16_node(void* t1, size_t bytes_t1) { void var_17_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(18); + void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); __visc__return(2, r, (size_t) 0); @@ -157,6 +193,8 @@ void var_17_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_18_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(19); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -165,6 +203,8 @@ void var_18_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_19_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(20); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -173,6 +213,8 @@ void var_19_node(void* t1, size_t bytes_t1) { void var_20_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(21); + void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); __visc__return(2, r, (size_t) 0); @@ -181,6 +223,8 @@ void var_20_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_21_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(22); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -189,6 +233,8 @@ void var_21_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_22_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::CUDNN_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(23); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -197,6 +243,8 @@ void var_22_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_23_node(void* t1, size_t bytes_t1) { __visc__hint(visc::CUDNN_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(24); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -205,6 +253,8 @@ void var_23_node(void* t1, size_t bytes_t1) { void var_24_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(25); + void *r = __visc__tensor_convolution(t1, t2, 1, 1, 2, 2); __visc__return(2, r, (size_t) 0); @@ -213,6 +263,8 @@ void var_24_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_25_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(26); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -221,6 +273,8 @@ void var_25_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_26_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(27); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -229,6 +283,8 @@ void var_26_node(void* t1, size_t bytes_t1) { void var_27_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(28); + void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); __visc__return(2, r, (size_t) 0); @@ -237,6 +293,8 @@ void var_27_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_28_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(29); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -245,6 +303,8 @@ void var_28_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_29_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(30); + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 2, 2); __visc__return(2, r, (size_t) 0); @@ -253,6 +313,8 @@ void var_29_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_30_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(31); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -261,6 +323,8 @@ void var_30_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_31_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::CUDNN_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(32); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -269,6 +333,8 @@ void var_31_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_32_node(void* t1, size_t bytes_t1) { __visc__hint(visc::CUDNN_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(33); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -277,6 +343,8 @@ void var_32_node(void* t1, size_t bytes_t1) { void var_33_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(34); + void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); __visc__return(2, r, (size_t) 0); @@ -285,6 +353,8 @@ void var_33_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_34_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(35); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -293,6 +363,8 @@ void var_34_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_35_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(36); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -301,6 +373,8 @@ void var_35_node(void* t1, size_t bytes_t1) { void var_36_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(37); + void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); __visc__return(2, r, (size_t) 0); @@ -309,6 +383,8 @@ void var_36_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_37_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(38); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -317,6 +393,8 @@ void var_37_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_38_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::CUDNN_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(39); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -325,6 +403,8 @@ void var_38_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_39_node(void* t1, size_t bytes_t1) { __visc__hint(visc::CUDNN_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(40); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -333,6 +413,8 @@ void var_39_node(void* t1, size_t bytes_t1) { void var_40_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(41); + void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); __visc__return(2, r, (size_t) 0); @@ -341,6 +423,8 @@ void var_40_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_41_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(42); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -349,6 +433,8 @@ void var_41_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_42_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(43); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -357,6 +443,8 @@ void var_42_node(void* t1, size_t bytes_t1) { void var_43_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(44); + void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); __visc__return(2, r, (size_t) 0); @@ -365,6 +453,8 @@ void var_43_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_44_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(45); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -373,6 +463,8 @@ void var_44_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_45_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::CUDNN_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(46); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -381,6 +473,8 @@ void var_45_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_46_node(void* t1, size_t bytes_t1) { __visc__hint(visc::CUDNN_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(47); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -389,6 +483,8 @@ void var_46_node(void* t1, size_t bytes_t1) { void var_47_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(48); + void *r = __visc__tensor_convolution(t1, t2, 1, 1, 2, 2); __visc__return(2, r, (size_t) 0); @@ -397,6 +493,8 @@ void var_47_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_48_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(49); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -405,6 +503,8 @@ void var_48_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_49_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(50); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -413,6 +513,8 @@ void var_49_node(void* t1, size_t bytes_t1) { void var_50_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(51); + void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); __visc__return(2, r, (size_t) 0); @@ -421,6 +523,8 @@ void var_50_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_51_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(52); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -429,6 +533,8 @@ void var_51_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_52_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(53); + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 2, 2); __visc__return(2, r, (size_t) 0); @@ -437,6 +543,8 @@ void var_52_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_53_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(54); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -445,6 +553,8 @@ void var_53_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_54_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::CUDNN_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(55); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -453,6 +563,8 @@ void var_54_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_55_node(void* t1, size_t bytes_t1) { __visc__hint(visc::CUDNN_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(56); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -461,6 +573,8 @@ void var_55_node(void* t1, size_t bytes_t1) { void var_56_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(57); + void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); __visc__return(2, r, (size_t) 0); @@ -469,6 +583,8 @@ void var_56_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_57_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(58); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -477,6 +593,8 @@ void var_57_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_58_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(59); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -485,6 +603,8 @@ void var_58_node(void* t1, size_t bytes_t1) { void var_59_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(60); + void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); __visc__return(2, r, (size_t) 0); @@ -493,6 +613,8 @@ void var_59_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_60_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(61); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -501,6 +623,8 @@ void var_60_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_61_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::CUDNN_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(62); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -509,6 +633,8 @@ void var_61_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_62_node(void* t1, size_t bytes_t1) { __visc__hint(visc::CUDNN_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(63); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -517,6 +643,8 @@ void var_62_node(void* t1, size_t bytes_t1) { void var_63_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(64); + void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); __visc__return(2, r, (size_t) 0); @@ -525,6 +653,8 @@ void var_63_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_64_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(65); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -533,6 +663,8 @@ void var_64_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_65_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(66); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -541,6 +673,8 @@ void var_65_node(void* t1, size_t bytes_t1) { void var_66_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(67); + void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); __visc__return(2, r, (size_t) 0); @@ -549,6 +683,8 @@ void var_66_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_67_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(68); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -557,6 +693,8 @@ void var_67_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_68_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::CUDNN_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(69); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -565,6 +703,8 @@ void var_68_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_69_node(void* t1, size_t bytes_t1) { __visc__hint(visc::CUDNN_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(70); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -573,6 +713,8 @@ void var_69_node(void* t1, size_t bytes_t1) { void var_70_node(void* t1, size_t bytes_t1) { __visc__hint(visc::CUDNN_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(71); + void* r = __visc__tensor_pool_mean(t1, 8, 8, 0, 0, 8, 8); __visc__return(2, r, (size_t) 0); @@ -581,6 +723,8 @@ void var_70_node(void* t1, size_t bytes_t1) { void var_71_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(72); + void *r = __visc__tensor_mul(t1, t2); __visc__return(2, r, (size_t) 0); @@ -589,6 +733,8 @@ void var_71_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_72_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(73); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -597,6 +743,8 @@ void var_72_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_73_node(void* t1, size_t bytes_t1) { __visc__hint(visc::CUDNN_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(74); + void* r = __visc__tensor_softmax(t1); __visc__return(2, r, (size_t) 0); @@ -652,6 +800,7 @@ void root(void* input, size_t input_bytes, __visc__hint(visc::CPU_TARGET); __visc__attributes(45, input, conv2d_1_w, conv2d_1_b, conv2d_2_w, conv2d_2_b, conv2d_3_w, conv2d_3_b, conv2d_4_w, conv2d_4_b, conv2d_5_w, conv2d_5_b, conv2d_6_w, conv2d_6_b, conv2d_7_w, conv2d_7_b, conv2d_8_w, conv2d_8_b, conv2d_10_w, conv2d_10_b, conv2d_9_w, conv2d_9_b, conv2d_11_w, conv2d_11_b, conv2d_12_w, conv2d_12_b, conv2d_13_w, conv2d_13_b, conv2d_14_w, conv2d_14_b, conv2d_15_w, conv2d_15_b, conv2d_17_w, conv2d_17_b, conv2d_16_w, conv2d_16_b, conv2d_18_w, conv2d_18_b, conv2d_19_w, conv2d_19_b, conv2d_20_w, conv2d_20_b, conv2d_21_w, conv2d_21_b, dense_1_w, dense_1_b, 0); + void* var_0 = __visc__createNodeND(0, var_0_node); @@ -1436,7 +1585,7 @@ int main(){ startMemTracking(); startProfiling(); - for (int j = 0; j < 14; j++){ + for (int j = 0; j < 1; j++){ for (int i = 0; i < batch_count; i++){ int start = i * batch_size; diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet50_imagenet/Makefile b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet50_imagenet/Makefile index 609901a4e2cce631af817a26bc75765d18b94208..5fb4534594a1aef9077f74f53ffe06281bc2b23a 100644 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet50_imagenet/Makefile +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet50_imagenet/Makefile @@ -1,5 +1,6 @@ - DNN_BENCHMARK_ROOT = $(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks +# NOTE: can configure build directory +#HPVM_BUILD_DIR = $(LLVM_SRC_ROOT)/../build_hpvm/ HPVM_BUILD_DIR = $(LLVM_BUILD_ROOT) CC = $(HPVM_BUILD_DIR)/bin/clang++ @@ -36,10 +37,13 @@ VISC_OPTFLAGS2 = -load $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LL WRAPPER_API_QUANT_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/data/quant_ranges_rt.txt CONF_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/data/tuner_confs.txt - VISC_OPTFLAGS3 = -load $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LLVMInPlaceDFGAnalysis.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_WrapperAPI.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_X86.so -load $(HPVM_LIB_DIR)/LLVMFuseHPVMTensorNodes.so -load $(HPVM_LIB_DIR)/LLVMClearDFG.so -inplace -hpvm-fuse -dfg2llvm-wrapperapi -quantization-levels-filename=$(WRAPPER_API_QUANT_FILE_PATH) -configuration-inputs-filename=$(CONF_FILE_PATH) -dfg2llvm-x86 -clearDFG +TEMP_CONF_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/predictive/temp.txt + + +VISC_PRED_OPTFLAGS3 = -load $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LLVMInPlaceDFGAnalysis.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_WrapperAPI.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_X86.so -load $(HPVM_LIB_DIR)/LLVMFuseHPVMTensorNodes.so -load $(HPVM_LIB_DIR)/LLVMClearDFG.so -inplace -hpvm-fuse -dfg2llvm-wrapperapi -quantization-levels-filename=$(WRAPPER_API_QUANT_FILE_PATH) -configuration-inputs-filename=$(TEMP_CONF_FILE_PATH) -dfg2llvm-x86 -clearDFG TARGET = $(BUILD_DIR)/$(APP).opt.bc SOURCES = $(SRC_DIR)/$(APP).cpp @@ -52,25 +56,32 @@ default: $(BUILD_DIR) $(TARGET) $(BUILD_DIR)/%.ll: $(SRC_DIR)/%.cpp $(CC) $(CC_FLAGS) -emit-llvm src/$(APP).cpp -S -o $(BUILD_DIR)/$(APP).ll - #-- $(CC) $(CC_FLAGS) -emit-llvm src/$(APP)_promise.cpp -S -o $(BUILD_DIR)/$(APP)_promise.ll + $(CC) $(CC_FLAGS) -emit-llvm src/$(APP)_promise.cpp -S -o $(BUILD_DIR)/$(APP)_promise.ll $(CC) $(CC_FLAGS) -emit-llvm src/$(APP)_loop.cpp -S -o $(BUILD_DIR)/$(APP)_loop.ll + $(BUILD_DIR)/%.opt.bc: $(BUILD_DIR)/%.ll $(OPT) -load LLVMGenVISC.so -genvisc -globaldce $(BUILD_DIR)/$(APP).ll -S -o $(BUILD_DIR)/$(APP).visc.ll - #-- $(OPT) -load LLVMGenVISC.so -genvisc -globaldce $(BUILD_DIR)/$(APP)_promise.ll -S -o $(BUILD_DIR)/$(APP)_promise.visc.ll + $(OPT) -load LLVMGenVISC.so -genvisc -globaldce $(BUILD_DIR)/$(APP)_promise.ll -S -o $(BUILD_DIR)/$(APP)_promise.visc.ll $(OPT) -load LLVMGenVISC.so -genvisc -globaldce $(BUILD_DIR)/$(APP)_loop.ll -S -o $(BUILD_DIR)/$(APP)_loop.visc.ll $(OPT) $(VISC_OPTFLAGS) $(BUILD_DIR)/$(APP).visc.ll -o $(BUILD_DIR)/$(APP)_cudnn.bc - #-- $(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_wrapperapi.bc + #$(OPT) $(VISC_OPTFLAGS2) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_promise.bc + $(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_wrapperapi.bc $(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_loop.visc.ll -o $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc - + $(OPT) $(VISC_PRED_OPTFLAGS3) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_pred_wrapperapi.bc + $(OPT) $(VISC_PRED_OPTFLAGS3) $(BUILD_DIR)/$(APP)_loop.visc.ll -o $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi.bc $(LLVM_LINK) $(BUILD_DIR)/$(APP)_cudnn.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_cudnn_linked.bc - #-- $(LLVM_LINK) $(BUILD_DIR)/$(APP)_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc + #$(LLVM_LINK) $(BUILD_DIR)/$(APP)_promise.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_promise_linked.bc + $(LLVM_LINK) $(BUILD_DIR)/$(APP)_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(LLVM_LINK) $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc + $(LLVM_LINK) $(BUILD_DIR)/$(APP)_pred_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_pred_wrapperapi_linked.bc + $(LLVM_LINK) $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi_linked.bc $(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_cudnn_linked $(LINKER_FLAGS) - #- $(CC) $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked $(LINKER_FLAGS) + #$(CC) $(BUILD_DIR)/$(APP)_promise_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_promise_linked $(LINKER_FLAGS) + $(CC) $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked $(LINKER_FLAGS) $(CC) $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked $(LINKER_FLAGS) - - + $(CC) $(BUILD_DIR)/$(APP)_pred_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_pred_wrapperapi_linked $(LINKER_FLAGS) + $(CC) $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi_linked $(LINKER_FLAGS) $(BUILD_DIR): mkdir -p $@ diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet50_imagenet/data/tuner_confs.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet50_imagenet/data/tuner_confs.txt index b0814ec4b32b2840a8af2fea7329af9b0769f6df..ede27ce6f5952d4d1be47640a46771d1f4c51ab2 100644 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet50_imagenet/data/tuner_confs.txt +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet50_imagenet/data/tuner_confs.txt @@ -4,15 +4,9 @@ conf1 1 1 75.7 0.0 1 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 2 gpu batchnorm fp32 11 3 gpu conv fp32 11 add fp32 1 -4 gpu conv fp32 11 add fp32 1 -5 gpu batchnorm fp32 11 -6 gpu batchnorm fp32 11 -7 gpu relu fp32 11 -8 gpu conv fp32 11 add fp32 1 -9 gpu batchnorm fp32 11 -10 gpu relu fp32 11 -11 gpu conv fp32 11 add fp32 1 -12 gpu batchnorm fp32 11 +4 gpu batchnorm fp32 11 +5 gpu relu fp32 11 +6 gpu conv fp32 11 add fp32 1 7 gpu batchnorm fp32 11 8 gpu relu fp32 11 9 gpu conv fp32 11 add fp32 1 diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet50_imagenet/predictive/resnet50_imagenet.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet50_imagenet/predictive/resnet50_imagenet.txt new file mode 100644 index 0000000000000000000000000000000000000000..094eed413b520f9dd661797b96735438861d1c08 --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet50_imagenet/predictive/resnet50_imagenet.txt @@ -0,0 +1,1057 @@ +7161.053769000008 ++++++ +conf1 1 1 75.7 0.0 +1 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +2 gpu batchnorm fp32 11 +3 gpu conv fp32 11 add fp32 1 +4 gpu batchnorm fp32 11 +5 gpu relu fp32 11 +6 gpu conv fp32 11 add fp32 1 +7 gpu batchnorm fp32 11 +8 gpu relu fp32 11 +9 gpu conv fp32 11 add fp32 1 +10 gpu batchnorm fp32 11 +11 gpu conv fp32 11 add fp32 1 +12 gpu batchnorm fp32 11 +13 gpu add fp32 11 +14 gpu relu fp32 11 +15 gpu conv fp32 11 add fp32 1 +16 gpu batchnorm fp32 11 +17 gpu relu fp32 11 +18 gpu conv fp32 11 add fp32 1 +19 gpu batchnorm fp32 11 +20 gpu relu fp32 11 +21 gpu conv fp32 11 add fp32 1 +22 gpu batchnorm fp32 11 +23 gpu add fp32 11 +24 gpu relu fp32 11 +25 gpu conv fp32 11 add fp32 1 +26 gpu batchnorm fp32 11 +27 gpu relu fp32 11 +28 gpu conv fp32 11 add fp32 1 +29 gpu batchnorm fp32 11 +30 gpu relu fp32 11 +31 gpu conv fp32 11 add fp32 1 +32 gpu batchnorm fp32 11 +33 gpu add fp32 11 +34 gpu relu fp32 11 +35 gpu conv fp32 11 add fp32 1 +36 gpu batchnorm fp32 11 +37 gpu relu fp32 11 +38 gpu conv fp32 11 add fp32 1 +39 gpu batchnorm fp32 11 +40 gpu relu fp32 11 +41 gpu conv fp32 11 add fp32 1 +42 gpu batchnorm fp32 11 +43 gpu conv fp32 11 add fp32 1 +44 gpu batchnorm fp32 11 +45 gpu add fp32 11 +46 gpu relu fp32 11 +47 gpu conv fp32 11 add fp32 1 +48 gpu batchnorm fp32 11 +49 gpu relu fp32 11 +50 gpu conv fp32 11 add fp32 1 +51 gpu batchnorm fp32 11 +52 gpu relu fp32 11 +53 gpu conv fp32 11 add fp32 1 +54 gpu batchnorm fp32 11 +55 gpu add fp32 11 +56 gpu relu fp32 11 +57 gpu conv fp32 11 add fp32 1 +58 gpu batchnorm fp32 11 +59 gpu relu fp32 11 +60 gpu conv fp32 11 add fp32 1 +61 gpu batchnorm fp32 11 +62 gpu relu fp32 11 +63 gpu conv fp32 11 add fp32 1 +64 gpu batchnorm fp32 11 +65 gpu add fp32 11 +66 gpu relu fp32 11 +67 gpu conv fp32 11 add fp32 1 +68 gpu batchnorm fp32 11 +69 gpu relu fp32 11 +70 gpu conv fp32 11 add fp32 1 +71 gpu batchnorm fp32 11 +72 gpu relu fp32 11 +73 gpu conv fp32 11 add fp32 1 +74 gpu batchnorm fp32 11 +75 gpu add fp32 11 +76 gpu relu fp32 11 +77 gpu conv fp32 11 add fp32 1 +78 gpu batchnorm fp32 11 +79 gpu relu fp32 11 +80 gpu conv fp32 11 add fp32 1 +81 gpu batchnorm fp32 11 +82 gpu relu fp32 11 +83 gpu conv fp32 11 add fp32 1 +84 gpu batchnorm fp32 11 +85 gpu conv fp32 11 add fp32 1 +86 gpu batchnorm fp32 11 +87 gpu add fp32 11 +88 gpu relu fp32 11 +89 gpu conv fp32 11 add fp32 1 +90 gpu batchnorm fp32 11 +91 gpu relu fp32 11 +92 gpu conv fp32 11 add fp32 1 +93 gpu batchnorm fp32 11 +94 gpu relu fp32 11 +95 gpu conv fp32 11 add fp32 1 +96 gpu batchnorm fp32 11 +97 gpu add fp32 11 +98 gpu relu fp32 11 +99 gpu conv fp32 11 add fp32 1 +100 gpu batchnorm fp32 11 +101 gpu relu fp32 11 +102 gpu conv fp32 11 add fp32 1 +103 gpu batchnorm fp32 11 +104 gpu relu fp32 11 +105 gpu conv fp32 11 add fp32 1 +106 gpu batchnorm fp32 11 +107 gpu add fp32 11 +108 gpu relu fp32 11 +109 gpu conv fp32 11 add fp32 1 +110 gpu batchnorm fp32 11 +111 gpu relu fp32 11 +112 gpu conv fp32 11 add fp32 1 +113 gpu batchnorm fp32 11 +114 gpu relu fp32 11 +115 gpu conv fp32 11 add fp32 1 +116 gpu batchnorm fp32 11 +117 gpu add fp32 11 +118 gpu relu fp32 11 +119 gpu conv fp32 11 add fp32 1 +120 gpu batchnorm fp32 11 +121 gpu relu fp32 11 +122 gpu conv fp32 11 add fp32 1 +123 gpu batchnorm fp32 11 +124 gpu relu fp32 11 +125 gpu conv fp32 11 add fp32 1 +126 gpu batchnorm fp32 11 +127 gpu add fp32 11 +128 gpu relu fp32 11 +129 gpu conv fp32 11 add fp32 1 +130 gpu batchnorm fp32 11 +131 gpu relu fp32 11 +132 gpu conv fp32 11 add fp32 1 +133 gpu batchnorm fp32 11 +134 gpu relu fp32 11 +135 gpu conv fp32 11 add fp32 1 +136 gpu batchnorm fp32 11 +137 gpu add fp32 11 +138 gpu relu fp32 11 +139 gpu conv fp32 11 add fp32 1 +140 gpu batchnorm fp32 11 +141 gpu relu fp32 11 +142 gpu conv fp32 11 add fp32 1 +143 gpu batchnorm fp32 11 +144 gpu relu fp32 11 +145 gpu conv fp32 11 add fp32 1 +146 gpu batchnorm fp32 11 +147 gpu conv fp32 11 add fp32 1 +148 gpu batchnorm fp32 11 +149 gpu add fp32 11 +150 gpu relu fp32 11 +151 gpu conv fp32 11 add fp32 1 +152 gpu batchnorm fp32 11 +153 gpu relu fp32 11 +154 gpu conv fp32 11 add fp32 1 +155 gpu batchnorm fp32 11 +156 gpu relu fp32 11 +157 gpu conv fp32 11 add fp32 1 +158 gpu batchnorm fp32 11 +159 gpu add fp32 11 +160 gpu relu fp32 11 +161 gpu conv fp32 11 add fp32 1 +162 gpu batchnorm fp32 11 +163 gpu relu fp32 11 +164 gpu conv fp32 11 add fp32 1 +165 gpu batchnorm fp32 11 +166 gpu relu fp32 11 +167 gpu conv fp32 11 add fp32 1 +168 gpu batchnorm fp32 11 +169 gpu add fp32 11 +170 gpu relu fp32 11 +171 gpu pool_max fp32 11 +172 gpu mul fp32 11 add fp32 1 +173 gpu softmax fp32 1 +----- ++++++ +conf2 1.8254789092281507 1.4527803526239977 75.7 0.0 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu batchnorm fp16 12 +3 gpu conv fp16 12 add fp16 1 +4 gpu batchnorm fp16 12 +5 gpu relu fp16 12 +6 gpu conv fp16 12 add fp16 1 +7 gpu batchnorm fp16 12 +8 gpu relu fp16 12 +9 gpu conv fp16 12 add fp16 1 +10 gpu batchnorm fp16 12 +11 gpu conv fp16 12 add fp16 1 +12 gpu batchnorm fp16 12 +13 gpu add fp16 12 +14 gpu relu fp16 12 +15 gpu conv fp16 12 add fp16 1 +16 gpu batchnorm fp16 12 +17 gpu relu fp16 12 +18 gpu conv fp16 12 add fp16 1 +19 gpu batchnorm fp16 12 +20 gpu relu fp16 12 +21 gpu conv fp16 12 add fp16 1 +22 gpu batchnorm fp16 12 +23 gpu add fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 add fp16 1 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu conv fp16 12 add fp16 1 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 12 add fp16 1 +32 gpu batchnorm fp16 12 +33 gpu add fp16 12 +34 gpu relu fp16 12 +35 gpu conv fp16 12 add fp16 1 +36 gpu batchnorm fp16 12 +37 gpu relu fp16 12 +38 gpu conv fp16 12 add fp16 1 +39 gpu batchnorm fp16 12 +40 gpu relu fp16 12 +41 gpu conv fp16 12 add fp16 1 +42 gpu batchnorm fp16 12 +43 gpu conv fp16 12 add fp16 1 +44 gpu batchnorm fp16 12 +45 gpu add fp16 12 +46 gpu relu fp16 12 +47 gpu conv fp16 12 add fp16 1 +48 gpu batchnorm fp16 12 +49 gpu relu fp16 12 +50 gpu conv fp16 12 add fp16 1 +51 gpu batchnorm fp16 12 +52 gpu relu fp16 12 +53 gpu conv fp16 12 add fp16 1 +54 gpu batchnorm fp16 12 +55 gpu add fp16 12 +56 gpu relu fp16 12 +57 gpu conv fp16 12 add fp16 1 +58 gpu batchnorm fp16 12 +59 gpu relu fp16 12 +60 gpu conv fp16 12 add fp16 1 +61 gpu batchnorm fp16 12 +62 gpu relu fp16 12 +63 gpu conv fp16 12 add fp16 1 +64 gpu batchnorm fp16 12 +65 gpu add fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 add fp16 1 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu conv fp16 12 add fp16 1 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv fp16 12 add fp16 1 +74 gpu batchnorm fp16 12 +75 gpu add fp16 12 +76 gpu relu fp16 12 +77 gpu conv fp16 12 add fp16 1 +78 gpu batchnorm fp16 12 +79 gpu relu fp16 12 +80 gpu conv fp16 12 add fp16 1 +81 gpu batchnorm fp16 12 +82 gpu relu fp16 12 +83 gpu conv fp16 12 add fp16 1 +84 gpu batchnorm fp16 12 +85 gpu conv fp16 12 add fp16 1 +86 gpu batchnorm fp16 12 +87 gpu add fp16 12 +88 gpu relu fp16 12 +89 gpu conv fp16 12 add fp16 1 +90 gpu batchnorm fp16 12 +91 gpu relu fp16 12 +92 gpu conv fp16 12 add fp16 1 +93 gpu batchnorm fp16 12 +94 gpu relu fp16 12 +95 gpu conv fp16 12 add fp16 1 +96 gpu batchnorm fp16 12 +97 gpu add fp16 12 +98 gpu relu fp16 12 +99 gpu conv fp16 12 add fp16 1 +100 gpu batchnorm fp16 12 +101 gpu relu fp16 12 +102 gpu conv fp16 12 add fp16 1 +103 gpu batchnorm fp16 12 +104 gpu relu fp16 12 +105 gpu conv fp16 12 add fp16 1 +106 gpu batchnorm fp16 12 +107 gpu add fp16 12 +108 gpu relu fp16 12 +109 gpu conv fp16 12 add fp16 1 +110 gpu batchnorm fp16 12 +111 gpu relu fp16 12 +112 gpu conv fp16 12 add fp16 1 +113 gpu batchnorm fp16 12 +114 gpu relu fp16 12 +115 gpu conv fp16 12 add fp16 1 +116 gpu batchnorm fp16 12 +117 gpu add fp16 12 +118 gpu relu fp16 12 +119 gpu conv fp16 12 add fp16 1 +120 gpu batchnorm fp16 12 +121 gpu relu fp16 12 +122 gpu conv fp16 12 add fp16 1 +123 gpu batchnorm fp16 12 +124 gpu relu fp16 12 +125 gpu conv fp16 12 add fp16 1 +126 gpu batchnorm fp16 12 +127 gpu add fp16 12 +128 gpu relu fp16 12 +129 gpu conv fp16 12 add fp16 1 +130 gpu batchnorm fp16 12 +131 gpu relu fp16 12 +132 gpu conv fp16 12 add fp16 1 +133 gpu batchnorm fp16 12 +134 gpu relu fp16 12 +135 gpu conv fp16 12 add fp16 1 +136 gpu batchnorm fp16 12 +137 gpu add fp16 12 +138 gpu relu fp16 12 +139 gpu conv fp16 12 add fp16 1 +140 gpu batchnorm fp16 12 +141 gpu relu fp16 12 +142 gpu conv fp16 12 add fp16 1 +143 gpu batchnorm fp16 12 +144 gpu relu fp16 12 +145 gpu conv fp16 12 add fp16 1 +146 gpu batchnorm fp16 12 +147 gpu conv fp16 12 add fp16 1 +148 gpu batchnorm fp16 12 +149 gpu add fp16 12 +150 gpu relu fp16 12 +151 gpu conv fp16 12 add fp16 1 +152 gpu batchnorm fp16 12 +153 gpu relu fp16 12 +154 gpu conv fp16 12 add fp16 1 +155 gpu batchnorm fp16 12 +156 gpu relu fp16 12 +157 gpu conv fp16 12 add fp16 1 +158 gpu batchnorm fp16 12 +159 gpu add fp16 12 +160 gpu relu fp16 12 +161 gpu conv fp16 12 add fp16 1 +162 gpu batchnorm fp16 12 +163 gpu relu fp16 12 +164 gpu conv fp16 12 add fp16 1 +165 gpu batchnorm fp16 12 +166 gpu relu fp16 12 +167 gpu conv fp16 12 add fp16 1 +168 gpu batchnorm fp16 12 +169 gpu add fp16 12 +170 gpu relu fp16 12 +171 gpu pool_max fp16 12 +172 gpu mul fp16 12 add fp16 1 +173 gpu softmax fp32 1 +----- ++++++ +conf3 1.8254789092281507 1.4527803526239977 75.7 0.0 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu batchnorm fp16 12 +3 gpu conv fp16 12 add fp16 1 +4 gpu batchnorm fp16 12 +5 gpu relu fp16 12 +6 gpu conv fp16 12 add fp16 1 +7 gpu batchnorm fp16 12 +8 gpu relu fp16 12 +9 gpu conv fp16 12 add fp16 1 +10 gpu batchnorm fp16 12 +11 gpu conv fp16 12 add fp16 1 +12 gpu batchnorm fp16 12 +13 gpu add fp16 12 +14 gpu relu fp16 12 +15 gpu conv fp16 12 add fp16 1 +16 gpu batchnorm fp16 12 +17 gpu relu fp16 12 +18 gpu conv fp16 12 add fp16 1 +19 gpu batchnorm fp16 12 +20 gpu relu fp16 12 +21 gpu conv fp16 12 add fp16 1 +22 gpu batchnorm fp16 12 +23 gpu add fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 add fp16 1 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu conv fp16 12 add fp16 1 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 12 add fp16 1 +32 gpu batchnorm fp16 12 +33 gpu add fp16 12 +34 gpu relu fp16 12 +35 gpu conv fp16 12 add fp16 1 +36 gpu batchnorm fp16 12 +37 gpu relu fp16 12 +38 gpu conv fp16 12 add fp16 1 +39 gpu batchnorm fp16 12 +40 gpu relu fp16 12 +41 gpu conv fp16 12 add fp16 1 +42 gpu batchnorm fp16 12 +43 gpu conv fp16 12 add fp16 1 +44 gpu batchnorm fp16 12 +45 gpu add fp16 12 +46 gpu relu fp16 12 +47 gpu conv fp16 12 add fp16 1 +48 gpu batchnorm fp16 12 +49 gpu relu fp16 12 +50 gpu conv fp16 12 add fp16 1 +51 gpu batchnorm fp16 12 +52 gpu relu fp16 12 +53 gpu conv fp16 12 add fp16 1 +54 gpu batchnorm fp16 12 +55 gpu add fp16 12 +56 gpu relu fp16 12 +57 gpu conv fp16 12 add fp16 1 +58 gpu batchnorm fp16 12 +59 gpu relu fp16 12 +60 gpu conv fp16 12 add fp16 1 +61 gpu batchnorm fp16 12 +62 gpu relu fp16 12 +63 gpu conv fp16 12 add fp16 1 +64 gpu batchnorm fp16 12 +65 gpu add fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 add fp16 1 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu conv fp16 12 add fp16 1 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv fp16 12 add fp16 1 +74 gpu batchnorm fp16 12 +75 gpu add fp16 12 +76 gpu relu fp16 12 +77 gpu conv fp16 12 add fp16 1 +78 gpu batchnorm fp16 12 +79 gpu relu fp16 12 +80 gpu conv fp16 12 add fp16 1 +81 gpu batchnorm fp16 12 +82 gpu relu fp16 12 +83 gpu conv fp16 12 add fp16 1 +84 gpu batchnorm fp16 12 +85 gpu conv fp16 12 add fp16 1 +86 gpu batchnorm fp16 12 +87 gpu add fp16 12 +88 gpu relu fp16 12 +89 gpu conv fp16 12 add fp16 1 +90 gpu batchnorm fp16 12 +91 gpu relu fp16 12 +92 gpu conv fp16 12 add fp16 1 +93 gpu batchnorm fp16 12 +94 gpu relu fp16 12 +95 gpu conv fp16 12 add fp16 1 +96 gpu batchnorm fp16 12 +97 gpu add fp16 12 +98 gpu relu fp16 12 +99 gpu conv fp16 12 add fp16 1 +100 gpu batchnorm fp16 12 +101 gpu relu fp16 12 +102 gpu conv fp16 12 add fp16 1 +103 gpu batchnorm fp16 12 +104 gpu relu fp16 12 +105 gpu conv fp16 12 add fp16 1 +106 gpu batchnorm fp16 12 +107 gpu add fp16 12 +108 gpu relu fp16 12 +109 gpu conv fp16 12 add fp16 1 +110 gpu batchnorm fp16 12 +111 gpu relu fp16 12 +112 gpu conv fp16 12 add fp16 1 +113 gpu batchnorm fp16 12 +114 gpu relu fp16 12 +115 gpu conv fp16 12 add fp16 1 +116 gpu batchnorm fp16 12 +117 gpu add fp16 12 +118 gpu relu fp16 12 +119 gpu conv fp16 12 add fp16 1 +120 gpu batchnorm fp16 12 +121 gpu relu fp16 12 +122 gpu conv fp16 12 add fp16 1 +123 gpu batchnorm fp16 12 +124 gpu relu fp16 12 +125 gpu conv fp16 12 add fp16 1 +126 gpu batchnorm fp16 12 +127 gpu add fp16 12 +128 gpu relu fp16 12 +129 gpu conv fp16 12 add fp16 1 +130 gpu batchnorm fp16 12 +131 gpu relu fp16 12 +132 gpu conv fp16 12 add fp16 1 +133 gpu batchnorm fp16 12 +134 gpu relu fp16 12 +135 gpu conv fp16 12 add fp16 1 +136 gpu batchnorm fp16 12 +137 gpu add fp16 12 +138 gpu relu fp16 12 +139 gpu conv fp16 12 add fp16 1 +140 gpu batchnorm fp16 12 +141 gpu relu fp16 12 +142 gpu conv fp16 12 add fp16 1 +143 gpu batchnorm fp16 12 +144 gpu relu fp16 12 +145 gpu conv fp16 12 add fp16 1 +146 gpu batchnorm fp16 12 +147 gpu conv fp16 12 add fp16 1 +148 gpu batchnorm fp16 12 +149 gpu add fp16 12 +150 gpu relu fp16 12 +151 gpu conv fp16 12 add fp16 1 +152 gpu batchnorm fp16 12 +153 gpu relu fp16 12 +154 gpu conv fp16 12 add fp16 1 +155 gpu batchnorm fp16 12 +156 gpu relu fp16 12 +157 gpu conv fp16 12 add fp16 1 +158 gpu batchnorm fp16 12 +159 gpu add fp16 12 +160 gpu relu fp16 12 +161 gpu conv fp16 12 add fp16 1 +162 gpu batchnorm fp16 12 +163 gpu relu fp16 12 +164 gpu conv fp16 12 add fp16 1 +165 gpu batchnorm fp16 12 +166 gpu relu fp16 12 +167 gpu conv fp16 12 add fp16 1 +168 gpu batchnorm fp16 12 +169 gpu add fp16 12 +170 gpu relu fp16 12 +171 gpu pool_max fp16 12 +172 gpu mul fp16 12 add fp16 1 +173 gpu softmax fp32 1 +----- ++++++ +conf4 1.8254789092281507 1.4527803526239977 75.7 0.0 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu batchnorm fp16 12 +3 gpu conv fp16 12 add fp16 1 +4 gpu batchnorm fp16 12 +5 gpu relu fp16 12 +6 gpu conv fp16 12 add fp16 1 +7 gpu batchnorm fp16 12 +8 gpu relu fp16 12 +9 gpu conv fp16 12 add fp16 1 +10 gpu batchnorm fp16 12 +11 gpu conv fp16 12 add fp16 1 +12 gpu batchnorm fp16 12 +13 gpu add fp16 12 +14 gpu relu fp16 12 +15 gpu conv fp16 12 add fp16 1 +16 gpu batchnorm fp16 12 +17 gpu relu fp16 12 +18 gpu conv fp16 12 add fp16 1 +19 gpu batchnorm fp16 12 +20 gpu relu fp16 12 +21 gpu conv fp16 12 add fp16 1 +22 gpu batchnorm fp16 12 +23 gpu add fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 add fp16 1 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu conv fp16 12 add fp16 1 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 12 add fp16 1 +32 gpu batchnorm fp16 12 +33 gpu add fp16 12 +34 gpu relu fp16 12 +35 gpu conv fp16 12 add fp16 1 +36 gpu batchnorm fp16 12 +37 gpu relu fp16 12 +38 gpu conv fp16 12 add fp16 1 +39 gpu batchnorm fp16 12 +40 gpu relu fp16 12 +41 gpu conv fp16 12 add fp16 1 +42 gpu batchnorm fp16 12 +43 gpu conv fp16 12 add fp16 1 +44 gpu batchnorm fp16 12 +45 gpu add fp16 12 +46 gpu relu fp16 12 +47 gpu conv fp16 12 add fp16 1 +48 gpu batchnorm fp16 12 +49 gpu relu fp16 12 +50 gpu conv fp16 12 add fp16 1 +51 gpu batchnorm fp16 12 +52 gpu relu fp16 12 +53 gpu conv fp16 12 add fp16 1 +54 gpu batchnorm fp16 12 +55 gpu add fp16 12 +56 gpu relu fp16 12 +57 gpu conv fp16 12 add fp16 1 +58 gpu batchnorm fp16 12 +59 gpu relu fp16 12 +60 gpu conv fp16 12 add fp16 1 +61 gpu batchnorm fp16 12 +62 gpu relu fp16 12 +63 gpu conv fp16 12 add fp16 1 +64 gpu batchnorm fp16 12 +65 gpu add fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 add fp16 1 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu conv fp16 12 add fp16 1 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv fp16 12 add fp16 1 +74 gpu batchnorm fp16 12 +75 gpu add fp16 12 +76 gpu relu fp16 12 +77 gpu conv fp16 12 add fp16 1 +78 gpu batchnorm fp16 12 +79 gpu relu fp16 12 +80 gpu conv fp16 12 add fp16 1 +81 gpu batchnorm fp16 12 +82 gpu relu fp16 12 +83 gpu conv fp16 12 add fp16 1 +84 gpu batchnorm fp16 12 +85 gpu conv fp16 12 add fp16 1 +86 gpu batchnorm fp16 12 +87 gpu add fp16 12 +88 gpu relu fp16 12 +89 gpu conv fp16 12 add fp16 1 +90 gpu batchnorm fp16 12 +91 gpu relu fp16 12 +92 gpu conv fp16 12 add fp16 1 +93 gpu batchnorm fp16 12 +94 gpu relu fp16 12 +95 gpu conv fp16 12 add fp16 1 +96 gpu batchnorm fp16 12 +97 gpu add fp16 12 +98 gpu relu fp16 12 +99 gpu conv fp16 12 add fp16 1 +100 gpu batchnorm fp16 12 +101 gpu relu fp16 12 +102 gpu conv fp16 12 add fp16 1 +103 gpu batchnorm fp16 12 +104 gpu relu fp16 12 +105 gpu conv fp16 12 add fp16 1 +106 gpu batchnorm fp16 12 +107 gpu add fp16 12 +108 gpu relu fp16 12 +109 gpu conv fp16 12 add fp16 1 +110 gpu batchnorm fp16 12 +111 gpu relu fp16 12 +112 gpu conv fp16 12 add fp16 1 +113 gpu batchnorm fp16 12 +114 gpu relu fp16 12 +115 gpu conv fp16 12 add fp16 1 +116 gpu batchnorm fp16 12 +117 gpu add fp16 12 +118 gpu relu fp16 12 +119 gpu conv fp16 12 add fp16 1 +120 gpu batchnorm fp16 12 +121 gpu relu fp16 12 +122 gpu conv fp16 12 add fp16 1 +123 gpu batchnorm fp16 12 +124 gpu relu fp16 12 +125 gpu conv fp16 12 add fp16 1 +126 gpu batchnorm fp16 12 +127 gpu add fp16 12 +128 gpu relu fp16 12 +129 gpu conv fp16 12 add fp16 1 +130 gpu batchnorm fp16 12 +131 gpu relu fp16 12 +132 gpu conv fp16 12 add fp16 1 +133 gpu batchnorm fp16 12 +134 gpu relu fp16 12 +135 gpu conv fp16 12 add fp16 1 +136 gpu batchnorm fp16 12 +137 gpu add fp16 12 +138 gpu relu fp16 12 +139 gpu conv fp16 12 add fp16 1 +140 gpu batchnorm fp16 12 +141 gpu relu fp16 12 +142 gpu conv fp16 12 add fp16 1 +143 gpu batchnorm fp16 12 +144 gpu relu fp16 12 +145 gpu conv fp16 12 add fp16 1 +146 gpu batchnorm fp16 12 +147 gpu conv fp16 12 add fp16 1 +148 gpu batchnorm fp16 12 +149 gpu add fp16 12 +150 gpu relu fp16 12 +151 gpu conv fp16 12 add fp16 1 +152 gpu batchnorm fp16 12 +153 gpu relu fp16 12 +154 gpu conv fp16 12 add fp16 1 +155 gpu batchnorm fp16 12 +156 gpu relu fp16 12 +157 gpu conv fp16 12 add fp16 1 +158 gpu batchnorm fp16 12 +159 gpu add fp16 12 +160 gpu relu fp16 12 +161 gpu conv fp16 12 add fp16 1 +162 gpu batchnorm fp16 12 +163 gpu relu fp16 12 +164 gpu conv fp16 12 add fp16 1 +165 gpu batchnorm fp16 12 +166 gpu relu fp16 12 +167 gpu conv fp16 12 add fp16 1 +168 gpu batchnorm fp16 12 +169 gpu add fp16 12 +170 gpu relu fp16 12 +171 gpu pool_max fp16 12 +172 gpu mul fp16 12 add fp16 1 +173 gpu softmax fp32 1 +----- ++++++ +conf5 1.8323072136026506 1.457112696128105 74.76 0.9399999999999977 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu batchnorm fp16 12 +3 gpu conv fp16 12 add fp16 1 +4 gpu batchnorm fp16 12 +5 gpu relu fp16 12 +6 gpu conv fp16 12 add fp16 1 +7 gpu batchnorm fp16 12 +8 gpu relu fp16 12 +9 gpu conv fp16 12 add fp16 1 +10 gpu batchnorm fp16 12 +11 gpu conv fp16 12 add fp16 1 +12 gpu batchnorm fp16 12 +13 gpu add fp16 12 +14 gpu relu fp16 12 +15 gpu conv fp16 12 add fp16 1 +16 gpu batchnorm fp16 12 +17 gpu relu fp16 12 +18 gpu conv fp16 12 add fp16 1 +19 gpu batchnorm fp16 12 +20 gpu relu fp16 12 +21 gpu conv fp16 12 add fp16 1 +22 gpu batchnorm fp16 12 +23 gpu add fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 add fp16 1 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu conv fp16 12 add fp16 1 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 12 add fp16 1 +32 gpu batchnorm fp16 12 +33 gpu add fp16 12 +34 gpu relu fp16 12 +35 gpu conv fp16 12 add fp16 1 +36 gpu batchnorm fp16 12 +37 gpu relu fp16 12 +38 gpu conv fp16 12 add fp16 1 +39 gpu batchnorm fp16 12 +40 gpu relu fp16 12 +41 gpu conv fp16 12 add fp16 1 +42 gpu batchnorm fp16 12 +43 gpu conv fp16 12 add fp16 1 +44 gpu batchnorm fp16 12 +45 gpu add fp16 12 +46 gpu relu fp16 12 +47 gpu conv fp16 12 add fp16 1 +48 gpu batchnorm fp16 12 +49 gpu relu fp16 12 +50 gpu conv fp16 12 add fp16 1 +51 gpu batchnorm fp16 12 +52 gpu relu fp16 12 +53 gpu conv fp16 12 add fp16 1 +54 gpu batchnorm fp16 12 +55 gpu add fp16 12 +56 gpu relu fp16 12 +57 gpu conv fp16 12 add fp16 1 +58 gpu batchnorm fp16 12 +59 gpu relu fp16 12 +60 gpu conv fp16 12 add fp16 1 +61 gpu batchnorm fp16 12 +62 gpu relu fp16 12 +63 gpu conv fp16 12 add fp16 1 +64 gpu batchnorm fp16 12 +65 gpu add fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 add fp16 1 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu conv fp16 12 add fp16 1 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv fp16 12 add fp16 1 +74 gpu batchnorm fp16 12 +75 gpu add fp16 12 +76 gpu relu fp16 12 +77 gpu conv fp16 12 add fp16 1 +78 gpu batchnorm fp16 12 +79 gpu relu fp16 12 +80 gpu conv fp16 12 add fp16 1 +81 gpu batchnorm fp16 12 +82 gpu relu fp16 12 +83 gpu conv fp16 12 add fp16 1 +84 gpu batchnorm fp16 12 +85 gpu conv fp16 12 add fp16 1 +86 gpu batchnorm fp16 12 +87 gpu add fp16 12 +88 gpu relu fp16 12 +89 gpu conv fp16 12 add fp16 1 +90 gpu batchnorm fp16 12 +91 gpu relu fp16 12 +92 gpu conv fp16 12 add fp16 1 +93 gpu batchnorm fp16 12 +94 gpu relu fp16 12 +95 gpu conv fp16 12 add fp16 1 +96 gpu batchnorm fp16 12 +97 gpu add fp16 12 +98 gpu relu fp16 12 +99 gpu conv perf_fp16 157 add fp16 1 +100 gpu batchnorm fp16 12 +101 gpu relu fp16 12 +102 gpu conv fp16 12 add fp16 1 +103 gpu batchnorm fp16 12 +104 gpu relu fp16 12 +105 gpu conv fp16 12 add fp16 1 +106 gpu batchnorm fp16 12 +107 gpu add fp16 12 +108 gpu relu fp16 12 +109 gpu conv fp16 12 add fp16 1 +110 gpu batchnorm fp16 12 +111 gpu relu fp16 12 +112 gpu conv fp16 12 add fp16 1 +113 gpu batchnorm fp16 12 +114 gpu relu fp16 12 +115 gpu conv fp16 12 add fp16 1 +116 gpu batchnorm fp16 12 +117 gpu add fp16 12 +118 gpu relu fp16 12 +119 gpu conv fp16 12 add fp16 1 +120 gpu batchnorm fp16 12 +121 gpu relu fp16 12 +122 gpu conv fp16 12 add fp16 1 +123 gpu batchnorm fp16 12 +124 gpu relu fp16 12 +125 gpu conv fp16 12 add fp16 1 +126 gpu batchnorm fp16 12 +127 gpu add fp16 12 +128 gpu relu fp16 12 +129 gpu conv fp16 12 add fp16 1 +130 gpu batchnorm fp16 12 +131 gpu relu fp16 12 +132 gpu conv fp16 12 add fp16 1 +133 gpu batchnorm fp16 12 +134 gpu relu fp16 12 +135 gpu conv fp16 12 add fp16 1 +136 gpu batchnorm fp16 12 +137 gpu add fp16 12 +138 gpu relu fp16 12 +139 gpu conv fp16 12 add fp16 1 +140 gpu batchnorm fp16 12 +141 gpu relu fp16 12 +142 gpu conv fp16 12 add fp16 1 +143 gpu batchnorm fp16 12 +144 gpu relu fp16 12 +145 gpu conv fp16 12 add fp16 1 +146 gpu batchnorm fp16 12 +147 gpu conv fp16 12 add fp16 1 +148 gpu batchnorm fp16 12 +149 gpu add fp16 12 +150 gpu relu fp16 12 +151 gpu conv fp16 12 add fp16 1 +152 gpu batchnorm fp16 12 +153 gpu relu fp16 12 +154 gpu conv fp16 12 add fp16 1 +155 gpu batchnorm fp16 12 +156 gpu relu fp16 12 +157 gpu conv fp16 12 add fp16 1 +158 gpu batchnorm fp16 12 +159 gpu add fp16 12 +160 gpu relu fp16 12 +161 gpu conv fp16 12 add fp16 1 +162 gpu batchnorm fp16 12 +163 gpu relu fp16 12 +164 gpu conv perf_fp16 152 add fp16 1 +165 gpu batchnorm fp16 12 +166 gpu relu fp16 12 +167 gpu conv fp16 12 add fp16 1 +168 gpu batchnorm fp16 12 +169 gpu add fp16 12 +170 gpu relu fp16 12 +171 gpu pool_max fp16 12 +172 gpu mul fp16 12 add fp16 1 +173 gpu softmax fp32 1 +----- ++++++ +conf6 1.8333922701839533 1.4589203187717397 74.53999999999999 1.1600000000000108 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +2 gpu batchnorm fp16 12 +3 gpu conv fp16 12 add fp16 1 +4 gpu batchnorm fp16 12 +5 gpu relu fp16 12 +6 gpu conv fp16 12 add fp16 1 +7 gpu batchnorm fp16 12 +8 gpu relu fp16 12 +9 gpu conv fp16 12 add fp16 1 +10 gpu batchnorm fp16 12 +11 gpu conv fp16 12 add fp16 1 +12 gpu batchnorm fp16 12 +13 gpu add fp16 12 +14 gpu relu fp16 12 +15 gpu conv fp16 12 add fp16 1 +16 gpu batchnorm fp16 12 +17 gpu relu fp16 12 +18 gpu conv fp16 12 add fp16 1 +19 gpu batchnorm fp16 12 +20 gpu relu fp16 12 +21 gpu conv fp16 12 add fp16 1 +22 gpu batchnorm fp16 12 +23 gpu add fp16 12 +24 gpu relu fp16 12 +25 gpu conv fp16 12 add fp16 1 +26 gpu batchnorm fp16 12 +27 gpu relu fp16 12 +28 gpu conv fp16 12 add fp16 1 +29 gpu batchnorm fp16 12 +30 gpu relu fp16 12 +31 gpu conv fp16 12 add fp16 1 +32 gpu batchnorm fp16 12 +33 gpu add fp16 12 +34 gpu relu fp16 12 +35 gpu conv fp16 12 add fp16 1 +36 gpu batchnorm fp16 12 +37 gpu relu fp16 12 +38 gpu conv fp16 12 add fp16 1 +39 gpu batchnorm fp16 12 +40 gpu relu fp16 12 +41 gpu conv fp16 12 add fp16 1 +42 gpu batchnorm fp16 12 +43 gpu conv fp16 12 add fp16 1 +44 gpu batchnorm fp16 12 +45 gpu add fp16 12 +46 gpu relu fp16 12 +47 gpu conv fp16 12 add fp16 1 +48 gpu batchnorm fp16 12 +49 gpu relu fp16 12 +50 gpu conv fp16 12 add fp16 1 +51 gpu batchnorm fp16 12 +52 gpu relu fp16 12 +53 gpu conv fp16 12 add fp16 1 +54 gpu batchnorm fp16 12 +55 gpu add fp16 12 +56 gpu relu fp16 12 +57 gpu conv fp16 12 add fp16 1 +58 gpu batchnorm fp16 12 +59 gpu relu fp16 12 +60 gpu conv fp16 12 add fp16 1 +61 gpu batchnorm fp16 12 +62 gpu relu fp16 12 +63 gpu conv fp16 12 add fp16 1 +64 gpu batchnorm fp16 12 +65 gpu add fp16 12 +66 gpu relu fp16 12 +67 gpu conv fp16 12 add fp16 1 +68 gpu batchnorm fp16 12 +69 gpu relu fp16 12 +70 gpu conv fp16 12 add fp16 1 +71 gpu batchnorm fp16 12 +72 gpu relu fp16 12 +73 gpu conv fp16 12 add fp16 1 +74 gpu batchnorm fp16 12 +75 gpu add fp16 12 +76 gpu relu fp16 12 +77 gpu conv fp16 12 add fp16 1 +78 gpu batchnorm fp16 12 +79 gpu relu fp16 12 +80 gpu conv fp16 12 add fp16 1 +81 gpu batchnorm fp16 12 +82 gpu relu fp16 12 +83 gpu conv fp16 12 add fp16 1 +84 gpu batchnorm fp16 12 +85 gpu conv fp16 12 add fp16 1 +86 gpu batchnorm fp16 12 +87 gpu add fp16 12 +88 gpu relu fp16 12 +89 gpu conv fp16 12 add fp16 1 +90 gpu batchnorm fp16 12 +91 gpu relu fp16 12 +92 gpu conv fp16 12 add fp16 1 +93 gpu batchnorm fp16 12 +94 gpu relu fp16 12 +95 gpu conv fp16 12 add fp16 1 +96 gpu batchnorm fp16 12 +97 gpu add fp16 12 +98 gpu relu fp16 12 +99 gpu conv perf_fp16 157 add fp16 1 +100 gpu batchnorm fp16 12 +101 gpu relu fp16 12 +102 gpu conv samp_fp16 267 add fp16 1 +103 gpu batchnorm fp16 12 +104 gpu relu fp16 12 +105 gpu conv fp16 12 add fp16 1 +106 gpu batchnorm fp16 12 +107 gpu add fp16 12 +108 gpu relu fp16 12 +109 gpu conv fp16 12 add fp16 1 +110 gpu batchnorm fp16 12 +111 gpu relu fp16 12 +112 gpu conv fp16 12 add fp16 1 +113 gpu batchnorm fp16 12 +114 gpu relu fp16 12 +115 gpu conv fp16 12 add fp16 1 +116 gpu batchnorm fp16 12 +117 gpu add fp16 12 +118 gpu relu fp16 12 +119 gpu conv fp16 12 add fp16 1 +120 gpu batchnorm fp16 12 +121 gpu relu fp16 12 +122 gpu conv fp16 12 add fp16 1 +123 gpu batchnorm fp16 12 +124 gpu relu fp16 12 +125 gpu conv fp16 12 add fp16 1 +126 gpu batchnorm fp16 12 +127 gpu add fp16 12 +128 gpu relu fp16 12 +129 gpu conv fp16 12 add fp16 1 +130 gpu batchnorm fp16 12 +131 gpu relu fp16 12 +132 gpu conv fp16 12 add fp16 1 +133 gpu batchnorm fp16 12 +134 gpu relu fp16 12 +135 gpu conv fp16 12 add fp16 1 +136 gpu batchnorm fp16 12 +137 gpu add fp16 12 +138 gpu relu fp16 12 +139 gpu conv fp16 12 add fp16 1 +140 gpu batchnorm fp16 12 +141 gpu relu fp16 12 +142 gpu conv fp16 12 add fp16 1 +143 gpu batchnorm fp16 12 +144 gpu relu fp16 12 +145 gpu conv fp16 12 add fp16 1 +146 gpu batchnorm fp16 12 +147 gpu conv fp16 12 add fp16 1 +148 gpu batchnorm fp16 12 +149 gpu add fp16 12 +150 gpu relu fp16 12 +151 gpu conv fp16 12 add fp16 1 +152 gpu batchnorm fp16 12 +153 gpu relu fp16 12 +154 gpu conv fp16 12 add fp16 1 +155 gpu batchnorm fp16 12 +156 gpu relu fp16 12 +157 gpu conv fp16 12 add fp16 1 +158 gpu batchnorm fp16 12 +159 gpu add fp16 12 +160 gpu relu fp16 12 +161 gpu conv fp16 12 add fp16 1 +162 gpu batchnorm fp16 12 +163 gpu relu fp16 12 +164 gpu conv perf_fp16 152 add fp16 1 +165 gpu batchnorm fp16 12 +166 gpu relu fp16 12 +167 gpu conv fp16 12 add fp16 1 +168 gpu batchnorm fp16 12 +169 gpu add fp16 12 +170 gpu relu fp16 12 +171 gpu pool_max fp16 12 +172 gpu mul fp16 12 add fp16 1 +173 gpu softmax fp32 1 +----- diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet50_imagenet/src/resnet50_imagenet_loop.cpp b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet50_imagenet/src/resnet50_imagenet_loop.cpp index 916f0440a7cbb3e16d024a67c8669bd73b5b7348..1458f17c5188b82c55bf2680fa749ab35db7e11a 100644 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet50_imagenet/src/resnet50_imagenet_loop.cpp +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/resnet50_imagenet/src/resnet50_imagenet_loop.cpp @@ -12,6 +12,8 @@ void var_0_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(1); + void *r = __visc__tensor_convolution(t1, t2, 3, 3, 2, 2); __visc__return(2, r, (size_t) 0); @@ -20,6 +22,8 @@ void var_0_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_1_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(2); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -28,6 +32,8 @@ void var_1_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_2_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(3); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -36,6 +42,8 @@ void var_2_node(void* t1, size_t bytes_t1) { void var_3_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(4); + void* r = __visc__tensor_pool_max(t1, 3, 3, 0, 0, 2, 2); __visc__return(2, r, (size_t) 0); @@ -44,6 +52,8 @@ void var_3_node(void* t1, size_t bytes_t1) { void var_4_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(5); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -52,6 +62,8 @@ void var_4_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, void var_5_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(6); + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); __visc__return(2, r, (size_t) 0); @@ -60,6 +72,8 @@ void var_5_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_6_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(7); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -68,6 +82,8 @@ void var_6_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_7_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(8); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -76,6 +92,8 @@ void var_7_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, void var_8_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(9); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -84,6 +102,8 @@ void var_8_node(void* t1, size_t bytes_t1) { void var_9_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(10); + void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); __visc__return(2, r, (size_t) 0); @@ -92,6 +112,8 @@ void var_9_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_10_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(11); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -100,6 +122,8 @@ void var_10_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_11_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(12); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -108,6 +132,8 @@ void var_11_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, void var_12_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(13); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -116,6 +142,8 @@ void var_12_node(void* t1, size_t bytes_t1) { void var_13_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(14); + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); __visc__return(2, r, (size_t) 0); @@ -124,6 +152,8 @@ void var_13_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_14_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(15); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -132,6 +162,8 @@ void var_14_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_15_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(16); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -140,6 +172,8 @@ void var_15_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, void var_16_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(17); + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); __visc__return(2, r, (size_t) 0); @@ -148,6 +182,8 @@ void var_16_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_17_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(18); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -156,6 +192,8 @@ void var_17_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_18_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(19); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -164,6 +202,8 @@ void var_18_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, void var_19_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(20); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -172,6 +212,8 @@ void var_19_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_20_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(21); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -180,6 +222,8 @@ void var_20_node(void* t1, size_t bytes_t1) { void var_21_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(22); + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); __visc__return(2, r, (size_t) 0); @@ -188,6 +232,8 @@ void var_21_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_22_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(23); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -196,6 +242,8 @@ void var_22_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_23_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(24); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -204,6 +252,8 @@ void var_23_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, void var_24_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(25); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -212,6 +262,8 @@ void var_24_node(void* t1, size_t bytes_t1) { void var_25_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(26); + void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); __visc__return(2, r, (size_t) 0); @@ -220,6 +272,8 @@ void var_25_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_26_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(27); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -228,6 +282,8 @@ void var_26_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_27_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(28); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -236,6 +292,8 @@ void var_27_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, void var_28_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(29); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -244,6 +302,8 @@ void var_28_node(void* t1, size_t bytes_t1) { void var_29_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(30); + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); __visc__return(2, r, (size_t) 0); @@ -252,6 +312,8 @@ void var_29_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_30_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(31); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -260,6 +322,8 @@ void var_30_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_31_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(32); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -268,6 +332,8 @@ void var_31_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, void var_32_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(33); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -276,6 +342,8 @@ void var_32_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_33_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(34); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -284,6 +352,8 @@ void var_33_node(void* t1, size_t bytes_t1) { void var_34_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(35); + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); __visc__return(2, r, (size_t) 0); @@ -292,6 +362,8 @@ void var_34_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_35_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(36); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -300,6 +372,8 @@ void var_35_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_36_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(37); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -308,6 +382,8 @@ void var_36_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, void var_37_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(38); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -316,6 +392,8 @@ void var_37_node(void* t1, size_t bytes_t1) { void var_38_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(39); + void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); __visc__return(2, r, (size_t) 0); @@ -324,6 +402,8 @@ void var_38_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_39_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(40); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -332,6 +412,8 @@ void var_39_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_40_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(41); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -340,6 +422,8 @@ void var_40_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, void var_41_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(42); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -348,6 +432,8 @@ void var_41_node(void* t1, size_t bytes_t1) { void var_42_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(43); + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); __visc__return(2, r, (size_t) 0); @@ -356,6 +442,8 @@ void var_42_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_43_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(44); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -364,6 +452,8 @@ void var_43_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_44_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(45); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -372,6 +462,8 @@ void var_44_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, void var_45_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(46); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -380,6 +472,8 @@ void var_45_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_46_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(47); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -388,6 +482,8 @@ void var_46_node(void* t1, size_t bytes_t1) { void var_47_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(48); + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 2, 2); __visc__return(2, r, (size_t) 0); @@ -396,6 +492,8 @@ void var_47_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_48_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(49); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -404,6 +502,8 @@ void var_48_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_49_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(50); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -412,6 +512,8 @@ void var_49_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, void var_50_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(51); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -420,6 +522,8 @@ void var_50_node(void* t1, size_t bytes_t1) { void var_51_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(52); + void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); __visc__return(2, r, (size_t) 0); @@ -428,6 +532,8 @@ void var_51_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_52_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(53); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -436,6 +542,8 @@ void var_52_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_53_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(54); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -444,6 +552,8 @@ void var_53_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, void var_54_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(55); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -452,6 +562,8 @@ void var_54_node(void* t1, size_t bytes_t1) { void var_55_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(56); + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); __visc__return(2, r, (size_t) 0); @@ -460,6 +572,8 @@ void var_55_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_56_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(57); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -468,6 +582,8 @@ void var_56_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_57_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(58); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -476,6 +592,8 @@ void var_57_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, void var_58_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(59); + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 2, 2); __visc__return(2, r, (size_t) 0); @@ -484,6 +602,8 @@ void var_58_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_59_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(60); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -492,6 +612,8 @@ void var_59_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_60_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(61); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -500,6 +622,8 @@ void var_60_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, void var_61_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(62); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -508,6 +632,8 @@ void var_61_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_62_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(63); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -516,6 +642,8 @@ void var_62_node(void* t1, size_t bytes_t1) { void var_63_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(64); + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); __visc__return(2, r, (size_t) 0); @@ -524,6 +652,8 @@ void var_63_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_64_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(65); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -532,6 +662,8 @@ void var_64_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_65_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(66); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -540,6 +672,8 @@ void var_65_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, void var_66_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(67); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -548,6 +682,8 @@ void var_66_node(void* t1, size_t bytes_t1) { void var_67_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(68); + void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); __visc__return(2, r, (size_t) 0); @@ -556,6 +692,8 @@ void var_67_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_68_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(69); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -564,6 +702,8 @@ void var_68_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_69_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(70); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -572,6 +712,8 @@ void var_69_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, void var_70_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(71); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -580,6 +722,8 @@ void var_70_node(void* t1, size_t bytes_t1) { void var_71_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(72); + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); __visc__return(2, r, (size_t) 0); @@ -588,6 +732,8 @@ void var_71_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_72_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(73); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -596,6 +742,8 @@ void var_72_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_73_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(74); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -604,6 +752,8 @@ void var_73_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, void var_74_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(75); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -612,6 +762,8 @@ void var_74_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_75_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(76); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -620,6 +772,8 @@ void var_75_node(void* t1, size_t bytes_t1) { void var_76_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(77); + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); __visc__return(2, r, (size_t) 0); @@ -628,6 +782,8 @@ void var_76_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_77_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(78); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -636,6 +792,8 @@ void var_77_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_78_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(79); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -644,6 +802,8 @@ void var_78_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, void var_79_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(80); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -652,6 +812,8 @@ void var_79_node(void* t1, size_t bytes_t1) { void var_80_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(81); + void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); __visc__return(2, r, (size_t) 0); @@ -660,6 +822,8 @@ void var_80_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_81_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(82); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -668,6 +832,8 @@ void var_81_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_82_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(83); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -676,6 +842,8 @@ void var_82_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, void var_83_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(84); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -684,6 +852,8 @@ void var_83_node(void* t1, size_t bytes_t1) { void var_84_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(85); + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); __visc__return(2, r, (size_t) 0); @@ -692,6 +862,8 @@ void var_84_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_85_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(86); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -700,6 +872,8 @@ void var_85_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_86_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(87); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -708,6 +882,8 @@ void var_86_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, void var_87_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(88); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -716,6 +892,8 @@ void var_87_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_88_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(89); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -724,6 +902,8 @@ void var_88_node(void* t1, size_t bytes_t1) { void var_89_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(90); + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); __visc__return(2, r, (size_t) 0); @@ -732,6 +912,8 @@ void var_89_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_90_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(91); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -740,6 +922,8 @@ void var_90_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_91_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(92); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -748,6 +932,8 @@ void var_91_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, void var_92_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(93); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -756,6 +942,8 @@ void var_92_node(void* t1, size_t bytes_t1) { void var_93_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(94); + void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); __visc__return(2, r, (size_t) 0); @@ -764,6 +952,8 @@ void var_93_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_94_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(95); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -772,6 +962,8 @@ void var_94_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_95_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(96); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -780,6 +972,8 @@ void var_95_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, void var_96_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(97); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -788,6 +982,8 @@ void var_96_node(void* t1, size_t bytes_t1) { void var_97_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(98); + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); __visc__return(2, r, (size_t) 0); @@ -796,6 +992,8 @@ void var_97_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_98_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(99); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -804,6 +1002,8 @@ void var_98_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_99_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(100); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -812,6 +1012,8 @@ void var_99_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, void var_100_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(101); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -820,6 +1022,8 @@ void var_100_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_101_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(102); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -828,6 +1032,8 @@ void var_101_node(void* t1, size_t bytes_t1) { void var_102_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(103); + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 2, 2); __visc__return(2, r, (size_t) 0); @@ -836,6 +1042,8 @@ void var_102_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_103_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(104); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -844,6 +1052,8 @@ void var_103_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_104_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(105); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -852,6 +1062,8 @@ void var_104_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3 void var_105_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(106); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -860,6 +1072,8 @@ void var_105_node(void* t1, size_t bytes_t1) { void var_106_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(107); + void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); __visc__return(2, r, (size_t) 0); @@ -868,6 +1082,8 @@ void var_106_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_107_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(108); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -876,6 +1092,8 @@ void var_107_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_108_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(109); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -884,6 +1102,8 @@ void var_108_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3 void var_109_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(110); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -892,6 +1112,8 @@ void var_109_node(void* t1, size_t bytes_t1) { void var_110_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(111); + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); __visc__return(2, r, (size_t) 0); @@ -900,6 +1122,8 @@ void var_110_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_111_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(112); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -908,6 +1132,8 @@ void var_111_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_112_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(113); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -916,6 +1142,8 @@ void var_112_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3 void var_113_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(114); + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 2, 2); __visc__return(2, r, (size_t) 0); @@ -924,6 +1152,8 @@ void var_113_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_114_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(115); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -932,6 +1162,8 @@ void var_114_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_115_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(116); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -940,6 +1172,8 @@ void var_115_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3 void var_116_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(117); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -948,6 +1182,8 @@ void var_116_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_117_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(118); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -956,6 +1192,8 @@ void var_117_node(void* t1, size_t bytes_t1) { void var_118_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(119); + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); __visc__return(2, r, (size_t) 0); @@ -964,6 +1202,8 @@ void var_118_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_119_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(120); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -972,6 +1212,8 @@ void var_119_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_120_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(121); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -980,6 +1222,8 @@ void var_120_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3 void var_121_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(122); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -988,6 +1232,8 @@ void var_121_node(void* t1, size_t bytes_t1) { void var_122_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(123); + void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); __visc__return(2, r, (size_t) 0); @@ -996,6 +1242,8 @@ void var_122_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_123_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(124); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -1004,6 +1252,8 @@ void var_123_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_124_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(125); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -1012,6 +1262,8 @@ void var_124_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3 void var_125_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(126); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -1020,6 +1272,8 @@ void var_125_node(void* t1, size_t bytes_t1) { void var_126_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(127); + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); __visc__return(2, r, (size_t) 0); @@ -1028,6 +1282,8 @@ void var_126_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_127_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(128); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -1036,6 +1292,8 @@ void var_127_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_128_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(129); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -1044,6 +1302,8 @@ void var_128_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3 void var_129_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(130); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -1052,6 +1312,8 @@ void var_129_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_130_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(131); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -1060,6 +1322,8 @@ void var_130_node(void* t1, size_t bytes_t1) { void var_131_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(132); + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); __visc__return(2, r, (size_t) 0); @@ -1068,6 +1332,8 @@ void var_131_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_132_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(133); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -1076,6 +1342,8 @@ void var_132_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_133_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(134); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -1084,6 +1352,8 @@ void var_133_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3 void var_134_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(135); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -1092,6 +1362,8 @@ void var_134_node(void* t1, size_t bytes_t1) { void var_135_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(136); + void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); __visc__return(2, r, (size_t) 0); @@ -1100,6 +1372,8 @@ void var_135_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_136_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(137); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -1108,6 +1382,8 @@ void var_136_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_137_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(138); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -1116,6 +1392,8 @@ void var_137_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3 void var_138_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(139); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -1124,6 +1402,8 @@ void var_138_node(void* t1, size_t bytes_t1) { void var_139_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(140); + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); __visc__return(2, r, (size_t) 0); @@ -1132,6 +1412,8 @@ void var_139_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_140_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(141); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -1140,6 +1422,8 @@ void var_140_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_141_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(142); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -1148,6 +1432,8 @@ void var_141_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3 void var_142_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(143); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -1156,6 +1442,8 @@ void var_142_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_143_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(144); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -1164,6 +1452,8 @@ void var_143_node(void* t1, size_t bytes_t1) { void var_144_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(145); + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); __visc__return(2, r, (size_t) 0); @@ -1172,6 +1462,8 @@ void var_144_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_145_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(146); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -1180,6 +1472,8 @@ void var_145_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_146_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(147); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -1188,6 +1482,8 @@ void var_146_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3 void var_147_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(148); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -1196,6 +1492,8 @@ void var_147_node(void* t1, size_t bytes_t1) { void var_148_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(149); + void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); __visc__return(2, r, (size_t) 0); @@ -1204,6 +1502,8 @@ void var_148_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_149_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(150); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -1212,6 +1512,8 @@ void var_149_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_150_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(151); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -1220,6 +1522,8 @@ void var_150_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3 void var_151_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(152); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -1228,6 +1532,8 @@ void var_151_node(void* t1, size_t bytes_t1) { void var_152_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(153); + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); __visc__return(2, r, (size_t) 0); @@ -1236,6 +1542,8 @@ void var_152_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_153_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(154); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -1244,6 +1552,8 @@ void var_153_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_154_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(155); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -1252,6 +1562,8 @@ void var_154_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3 void var_155_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(156); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -1260,6 +1572,8 @@ void var_155_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_156_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(157); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -1268,6 +1582,8 @@ void var_156_node(void* t1, size_t bytes_t1) { void var_157_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(158); + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); __visc__return(2, r, (size_t) 0); @@ -1276,6 +1592,8 @@ void var_157_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_158_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(159); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -1284,6 +1602,8 @@ void var_158_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_159_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(160); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -1292,6 +1612,8 @@ void var_159_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3 void var_160_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(161); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -1300,6 +1622,8 @@ void var_160_node(void* t1, size_t bytes_t1) { void var_161_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(162); + void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); __visc__return(2, r, (size_t) 0); @@ -1308,6 +1632,8 @@ void var_161_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_162_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(163); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -1316,6 +1642,8 @@ void var_162_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_163_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(164); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -1324,6 +1652,8 @@ void var_163_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3 void var_164_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(165); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -1332,6 +1662,8 @@ void var_164_node(void* t1, size_t bytes_t1) { void var_165_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(166); + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); __visc__return(2, r, (size_t) 0); @@ -1340,6 +1672,8 @@ void var_165_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_166_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(167); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -1348,6 +1682,8 @@ void var_166_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_167_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(168); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -1356,6 +1692,8 @@ void var_167_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3 void var_168_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(169); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -1364,6 +1702,8 @@ void var_168_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_169_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(170); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -1372,6 +1712,8 @@ void var_169_node(void* t1, size_t bytes_t1) { void var_170_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(171); + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); __visc__return(2, r, (size_t) 0); @@ -1380,6 +1722,8 @@ void var_170_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_171_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(172); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -1388,6 +1732,8 @@ void var_171_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_172_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(173); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -1396,6 +1742,8 @@ void var_172_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3 void var_173_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(174); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -1404,6 +1752,8 @@ void var_173_node(void* t1, size_t bytes_t1) { void var_174_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(175); + void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); __visc__return(2, r, (size_t) 0); @@ -1412,6 +1762,8 @@ void var_174_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_175_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(176); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -1420,6 +1772,8 @@ void var_175_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_176_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(177); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -1428,6 +1782,8 @@ void var_176_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3 void var_177_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(178); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -1436,6 +1792,8 @@ void var_177_node(void* t1, size_t bytes_t1) { void var_178_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(179); + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); __visc__return(2, r, (size_t) 0); @@ -1444,6 +1802,8 @@ void var_178_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_179_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(180); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -1452,6 +1812,8 @@ void var_179_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_180_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(181); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -1460,6 +1822,8 @@ void var_180_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3 void var_181_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(182); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -1468,6 +1832,8 @@ void var_181_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_182_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(183); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -1476,6 +1842,8 @@ void var_182_node(void* t1, size_t bytes_t1) { void var_183_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(184); + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 2, 2); __visc__return(2, r, (size_t) 0); @@ -1484,6 +1852,8 @@ void var_183_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_184_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(185); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -1492,6 +1862,8 @@ void var_184_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_185_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(186); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -1500,6 +1872,8 @@ void var_185_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3 void var_186_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(187); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -1508,6 +1882,8 @@ void var_186_node(void* t1, size_t bytes_t1) { void var_187_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(188); + void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); __visc__return(2, r, (size_t) 0); @@ -1516,6 +1892,8 @@ void var_187_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_188_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(189); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -1524,6 +1902,8 @@ void var_188_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_189_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(190); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -1532,6 +1912,8 @@ void var_189_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3 void var_190_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(191); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -1540,6 +1922,8 @@ void var_190_node(void* t1, size_t bytes_t1) { void var_191_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(192); + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); __visc__return(2, r, (size_t) 0); @@ -1548,6 +1932,8 @@ void var_191_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_192_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(193); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -1556,6 +1942,8 @@ void var_192_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_193_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(194); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -1564,6 +1952,8 @@ void var_193_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3 void var_194_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(195); + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 2, 2); __visc__return(2, r, (size_t) 0); @@ -1572,6 +1962,8 @@ void var_194_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_195_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(196); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -1580,6 +1972,8 @@ void var_195_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_196_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(197); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -1588,6 +1982,8 @@ void var_196_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3 void var_197_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(198); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -1596,6 +1992,8 @@ void var_197_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_198_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(199); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -1604,6 +2002,8 @@ void var_198_node(void* t1, size_t bytes_t1) { void var_199_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(200); + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); __visc__return(2, r, (size_t) 0); @@ -1612,6 +2012,8 @@ void var_199_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_200_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(201); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -1620,6 +2022,8 @@ void var_200_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_201_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(202); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -1628,6 +2032,8 @@ void var_201_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3 void var_202_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(203); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -1636,6 +2042,8 @@ void var_202_node(void* t1, size_t bytes_t1) { void var_203_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(204); + void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); __visc__return(2, r, (size_t) 0); @@ -1644,6 +2052,8 @@ void var_203_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_204_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(205); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -1652,6 +2062,8 @@ void var_204_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_205_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(206); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -1660,6 +2072,8 @@ void var_205_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3 void var_206_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(207); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -1668,6 +2082,8 @@ void var_206_node(void* t1, size_t bytes_t1) { void var_207_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(208); + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); __visc__return(2, r, (size_t) 0); @@ -1676,6 +2092,8 @@ void var_207_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_208_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(209); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -1684,6 +2102,8 @@ void var_208_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_209_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(210); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -1692,6 +2112,8 @@ void var_209_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3 void var_210_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(211); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -1700,6 +2122,8 @@ void var_210_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_211_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(212); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -1708,6 +2132,8 @@ void var_211_node(void* t1, size_t bytes_t1) { void var_212_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(213); + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); __visc__return(2, r, (size_t) 0); @@ -1716,6 +2142,8 @@ void var_212_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_213_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(214); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -1724,6 +2152,8 @@ void var_213_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_214_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(215); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -1732,6 +2162,8 @@ void var_214_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3 void var_215_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(216); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -1740,6 +2172,8 @@ void var_215_node(void* t1, size_t bytes_t1) { void var_216_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(217); + void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); __visc__return(2, r, (size_t) 0); @@ -1748,6 +2182,8 @@ void var_216_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_217_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(218); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -1756,6 +2192,8 @@ void var_217_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_218_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(219); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -1764,6 +2202,8 @@ void var_218_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3 void var_219_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(220); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -1772,6 +2212,8 @@ void var_219_node(void* t1, size_t bytes_t1) { void var_220_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(221); + void *r = __visc__tensor_convolution(t1, t2, 0, 0, 1, 1); __visc__return(2, r, (size_t) 0); @@ -1780,6 +2222,8 @@ void var_220_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_221_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(222); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -1788,6 +2232,8 @@ void var_221_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_222_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3, size_t bytes_t3, void* t4, size_t bytes_t4, void* t5, size_t bytes_t5) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(5, t1, t2, t3, t4, t5, 0); + __visc__node_id(223); + void *r = __visc__tensor_batchnorm(t1, t2, t3, t4, t5, 0.001); __visc__return(2, r, (size_t) 0); @@ -1796,6 +2242,8 @@ void var_222_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2, void* t3 void var_223_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(224); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -1804,6 +2252,8 @@ void var_223_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_224_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(225); + void* r = __visc__tensor_relu(t1); __visc__return(2, r, (size_t) 0); @@ -1812,6 +2262,8 @@ void var_224_node(void* t1, size_t bytes_t1) { void var_225_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(226); + void* r = __visc__tensor_pool_mean(t1, 7, 7, 0, 0, 7, 7); __visc__return(2, r, (size_t) 0); @@ -1820,6 +2272,8 @@ void var_225_node(void* t1, size_t bytes_t1) { void var_226_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(227); + void *r = __visc__tensor_mul(t1, t2); __visc__return(2, r, (size_t) 0); @@ -1828,6 +2282,8 @@ void var_226_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_227_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(2, t1, t2, 0); + __visc__node_id(228); + void *r = __visc__tensor_add(t1, t2); __visc__return(2, r, (size_t) 0); @@ -1836,6 +2292,8 @@ void var_227_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { void var_228_node(void* t1, size_t bytes_t1) { __visc__hint(visc::PROMISE_TARGET); __visc__attributes(1, t1, 0); + __visc__node_id(229); + void* r = __visc__tensor_softmax(t1); __visc__return(2, r, (size_t) 0); @@ -2168,6 +2626,8 @@ void root(void* input, size_t input_bytes, __visc__attributes(321, input, conv2d_1_w, conv2d_1_b, batch_normalization_1_gamma, batch_normalization_1_beta, batch_normalization_1_mean, batch_normalization_1_variance, conv2d_2_w, conv2d_2_b, batch_normalization_2_gamma, batch_normalization_2_beta, batch_normalization_2_mean, batch_normalization_2_variance, conv2d_3_w, conv2d_3_b, batch_normalization_3_gamma, batch_normalization_3_beta, batch_normalization_3_mean, batch_normalization_3_variance, conv2d_4_w, conv2d_4_b, conv2d_5_w, conv2d_5_b, batch_normalization_4_gamma, batch_normalization_4_beta, batch_normalization_4_mean, batch_normalization_4_variance, batch_normalization_5_gamma, batch_normalization_5_beta, batch_normalization_5_mean, batch_normalization_5_variance, conv2d_6_w, conv2d_6_b, batch_normalization_6_gamma, batch_normalization_6_beta, batch_normalization_6_mean, batch_normalization_6_variance, conv2d_7_w, conv2d_7_b, batch_normalization_7_gamma, batch_normalization_7_beta, batch_normalization_7_mean, batch_normalization_7_variance, conv2d_8_w, conv2d_8_b, batch_normalization_8_gamma, batch_normalization_8_beta, batch_normalization_8_mean, batch_normalization_8_variance, conv2d_9_w, conv2d_9_b, batch_normalization_9_gamma, batch_normalization_9_beta, batch_normalization_9_mean, batch_normalization_9_variance, conv2d_10_w, conv2d_10_b, batch_normalization_10_gamma, batch_normalization_10_beta, batch_normalization_10_mean, batch_normalization_10_variance, conv2d_11_w, conv2d_11_b, batch_normalization_11_gamma, batch_normalization_11_beta, batch_normalization_11_mean, batch_normalization_11_variance, conv2d_12_w, conv2d_12_b, batch_normalization_12_gamma, batch_normalization_12_beta, batch_normalization_12_mean, batch_normalization_12_variance, conv2d_13_w, conv2d_13_b, batch_normalization_13_gamma, batch_normalization_13_beta, batch_normalization_13_mean, batch_normalization_13_variance, conv2d_14_w, conv2d_14_b, conv2d_15_w, conv2d_15_b, batch_normalization_14_gamma, batch_normalization_14_beta, batch_normalization_14_mean, batch_normalization_14_variance, batch_normalization_15_gamma, batch_normalization_15_beta, batch_normalization_15_mean, batch_normalization_15_variance, conv2d_16_w, conv2d_16_b, batch_normalization_16_gamma, batch_normalization_16_beta, batch_normalization_16_mean, batch_normalization_16_variance, conv2d_17_w, conv2d_17_b, batch_normalization_17_gamma, batch_normalization_17_beta, batch_normalization_17_mean, batch_normalization_17_variance, conv2d_18_w, conv2d_18_b, batch_normalization_18_gamma, batch_normalization_18_beta, batch_normalization_18_mean, batch_normalization_18_variance, conv2d_19_w, conv2d_19_b, batch_normalization_19_gamma, batch_normalization_19_beta, batch_normalization_19_mean, batch_normalization_19_variance, conv2d_20_w, conv2d_20_b, batch_normalization_20_gamma, batch_normalization_20_beta, batch_normalization_20_mean, batch_normalization_20_variance, conv2d_21_w, conv2d_21_b, batch_normalization_21_gamma, batch_normalization_21_beta, batch_normalization_21_mean, batch_normalization_21_variance, conv2d_22_w, conv2d_22_b, batch_normalization_22_gamma, batch_normalization_22_beta, batch_normalization_22_mean, batch_normalization_22_variance, conv2d_23_w, conv2d_23_b, batch_normalization_23_gamma, batch_normalization_23_beta, batch_normalization_23_mean, batch_normalization_23_variance, conv2d_24_w, conv2d_24_b, batch_normalization_24_gamma, batch_normalization_24_beta, batch_normalization_24_mean, batch_normalization_24_variance, conv2d_25_w, conv2d_25_b, batch_normalization_25_gamma, batch_normalization_25_beta, batch_normalization_25_mean, batch_normalization_25_variance, conv2d_26_w, conv2d_26_b, batch_normalization_26_gamma, batch_normalization_26_beta, batch_normalization_26_mean, batch_normalization_26_variance, conv2d_27_w, conv2d_27_b, conv2d_28_w, conv2d_28_b, batch_normalization_27_gamma, batch_normalization_27_beta, batch_normalization_27_mean, batch_normalization_27_variance, batch_normalization_28_gamma, batch_normalization_28_beta, batch_normalization_28_mean, batch_normalization_28_variance, conv2d_29_w, conv2d_29_b, batch_normalization_29_gamma, batch_normalization_29_beta, batch_normalization_29_mean, batch_normalization_29_variance, conv2d_30_w, conv2d_30_b, batch_normalization_30_gamma, batch_normalization_30_beta, batch_normalization_30_mean, batch_normalization_30_variance, conv2d_31_w, conv2d_31_b, batch_normalization_31_gamma, batch_normalization_31_beta, batch_normalization_31_mean, batch_normalization_31_variance, conv2d_32_w, conv2d_32_b, batch_normalization_32_gamma, batch_normalization_32_beta, batch_normalization_32_mean, batch_normalization_32_variance, conv2d_33_w, conv2d_33_b, batch_normalization_33_gamma, batch_normalization_33_beta, batch_normalization_33_mean, batch_normalization_33_variance, conv2d_34_w, conv2d_34_b, batch_normalization_34_gamma, batch_normalization_34_beta, batch_normalization_34_mean, batch_normalization_34_variance, conv2d_35_w, conv2d_35_b, batch_normalization_35_gamma, batch_normalization_35_beta, batch_normalization_35_mean, batch_normalization_35_variance, conv2d_36_w, conv2d_36_b, batch_normalization_36_gamma, batch_normalization_36_beta, batch_normalization_36_mean, batch_normalization_36_variance, conv2d_37_w, conv2d_37_b, batch_normalization_37_gamma, batch_normalization_37_beta, batch_normalization_37_mean, batch_normalization_37_variance, conv2d_38_w, conv2d_38_b, batch_normalization_38_gamma, batch_normalization_38_beta, batch_normalization_38_mean, batch_normalization_38_variance, conv2d_39_w, conv2d_39_b, batch_normalization_39_gamma, batch_normalization_39_beta, batch_normalization_39_mean, batch_normalization_39_variance, conv2d_40_w, conv2d_40_b, batch_normalization_40_gamma, batch_normalization_40_beta, batch_normalization_40_mean, batch_normalization_40_variance, conv2d_41_w, conv2d_41_b, batch_normalization_41_gamma, batch_normalization_41_beta, batch_normalization_41_mean, batch_normalization_41_variance, conv2d_42_w, conv2d_42_b, batch_normalization_42_gamma, batch_normalization_42_beta, batch_normalization_42_mean, batch_normalization_42_variance, conv2d_43_w, conv2d_43_b, batch_normalization_43_gamma, batch_normalization_43_beta, batch_normalization_43_mean, batch_normalization_43_variance, conv2d_44_w, conv2d_44_b, batch_normalization_44_gamma, batch_normalization_44_beta, batch_normalization_44_mean, batch_normalization_44_variance, conv2d_45_w, conv2d_45_b, batch_normalization_45_gamma, batch_normalization_45_beta, batch_normalization_45_mean, batch_normalization_45_variance, conv2d_46_w, conv2d_46_b, conv2d_47_w, conv2d_47_b, batch_normalization_46_gamma, batch_normalization_46_beta, batch_normalization_46_mean, batch_normalization_46_variance, batch_normalization_47_gamma, batch_normalization_47_beta, batch_normalization_47_mean, batch_normalization_47_variance, conv2d_48_w, conv2d_48_b, batch_normalization_48_gamma, batch_normalization_48_beta, batch_normalization_48_mean, batch_normalization_48_variance, conv2d_49_w, conv2d_49_b, batch_normalization_49_gamma, batch_normalization_49_beta, batch_normalization_49_mean, batch_normalization_49_variance, conv2d_50_w, conv2d_50_b, batch_normalization_50_gamma, batch_normalization_50_beta, batch_normalization_50_mean, batch_normalization_50_variance, conv2d_51_w, conv2d_51_b, batch_normalization_51_gamma, batch_normalization_51_beta, batch_normalization_51_mean, batch_normalization_51_variance, conv2d_52_w, conv2d_52_b, batch_normalization_52_gamma, batch_normalization_52_beta, batch_normalization_52_mean, batch_normalization_52_variance, conv2d_53_w, conv2d_53_b, batch_normalization_53_gamma, batch_normalization_53_beta, batch_normalization_53_mean, batch_normalization_53_variance, dense_1_w, dense_1_b, 0); + + void* var_0 = __visc__createNodeND(0, var_0_node); __visc__bindIn(var_0, 0, 0, 0); diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/Makefile b/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/Makefile index 84539b3b05cb8ad8a77d9842812b69a4f1b17916..cf001e5dc9187b9562959af3223970ed9dce9b97 100644 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/Makefile +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/Makefile @@ -13,7 +13,6 @@ SRC_DIR = src BUILD_DIR = build APP = vgg16_cifar10 - TENSOR_INCLUDE_DIR = $(DNN_BENCHMARK_ROOT)/common/include TENSOR_RT_INCLUDE_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/tensor_runtime/include TENSOR_LIB_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/lib/libtensor_runtime.a @@ -24,6 +23,7 @@ TENSOR_AUTOTUNER_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/lib/libtensor_au CC_FLAGS = -I $(LLVM_INCLUDE_DIR) -I $(TENSOR_INCLUDE_DIR) -I $(TENSOR_RT_INCLUDE_DIR) -I $(CUDA_INCLUDE_PATH) -fno-exceptions -ffast-math -std=c++11 -O3 CCFLAGS += -DDEVICE=CUDNN_TARGET LINKER_FLAGS = -lpthread -lcudart -lcurand -lcudnn -lcublas -lcufft -lOpenCL -lstdc++fs -lomp + HPVM_LIB_DIR = $(HPVM_BUILD_DIR)/lib @@ -35,12 +35,16 @@ PROMISE_QUANT_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(A VISC_OPTFLAGS2 = -load $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LLVMInPlaceDFGAnalysis.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_PROMISE.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_CUDNN.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_X86.so -load $(HPVM_LIB_DIR)/LLVMFuseHPVMTensorNodes.so -load $(HPVM_LIB_DIR)/LLVMClearDFG.so -inplace -hpvm-fuse -dfg2llvm-promise -quantization-levels-filename=$(PROMISE_QUANT_FILE_PATH) -dfg2llvm-cudnn -dfg2llvm-x86 -clearDFG WRAPPER_API_QUANT_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/data/quant_ranges_rt.txt - -CONF_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/data/tuner_confs_base.txt +CONF_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/data/tuner_confs.txt VISC_OPTFLAGS3 = -load $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LLVMInPlaceDFGAnalysis.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_WrapperAPI.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_X86.so -load $(HPVM_LIB_DIR)/LLVMFuseHPVMTensorNodes.so -load $(HPVM_LIB_DIR)/LLVMClearDFG.so -inplace -hpvm-fuse -dfg2llvm-wrapperapi -quantization-levels-filename=$(WRAPPER_API_QUANT_FILE_PATH) -configuration-inputs-filename=$(CONF_FILE_PATH) -dfg2llvm-x86 -clearDFG +TEMP_CONF_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/predictive/temp.txt + + +VISC_PRED_OPTFLAGS3 = -load $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LLVMInPlaceDFGAnalysis.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_WrapperAPI.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_X86.so -load $(HPVM_LIB_DIR)/LLVMFuseHPVMTensorNodes.so -load $(HPVM_LIB_DIR)/LLVMClearDFG.so -inplace -hpvm-fuse -dfg2llvm-wrapperapi -quantization-levels-filename=$(WRAPPER_API_QUANT_FILE_PATH) -configuration-inputs-filename=$(TEMP_CONF_FILE_PATH) -dfg2llvm-x86 -clearDFG + TARGET = $(BUILD_DIR)/$(APP).opt.bc SOURCES = $(SRC_DIR)/$(APP).cpp VISC_RT_PATH = $(LLVM_SRC_ROOT)/../build/projects/visc-rt/visc-rt.ll @@ -64,14 +68,20 @@ $(BUILD_DIR)/%.opt.bc: $(BUILD_DIR)/%.ll #$(OPT) $(VISC_OPTFLAGS2) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_promise.bc $(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_wrapperapi.bc $(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_loop.visc.ll -o $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc + $(OPT) $(VISC_PRED_OPTFLAGS3) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_pred_wrapperapi.bc + $(OPT) $(VISC_PRED_OPTFLAGS3) $(BUILD_DIR)/$(APP)_loop.visc.ll -o $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi.bc $(LLVM_LINK) $(BUILD_DIR)/$(APP)_cudnn.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_cudnn_linked.bc #$(LLVM_LINK) $(BUILD_DIR)/$(APP)_promise.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_promise_linked.bc $(LLVM_LINK) $(BUILD_DIR)/$(APP)_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(LLVM_LINK) $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc + $(LLVM_LINK) $(BUILD_DIR)/$(APP)_pred_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_pred_wrapperapi_linked.bc + $(LLVM_LINK) $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi_linked.bc $(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_cudnn_linked $(LINKER_FLAGS) #$(CC) $(BUILD_DIR)/$(APP)_promise_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_promise_linked $(LINKER_FLAGS) $(CC) $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked $(LINKER_FLAGS) $(CC) $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked $(LINKER_FLAGS) + $(CC) $(BUILD_DIR)/$(APP)_pred_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_pred_wrapperapi_linked $(LINKER_FLAGS) + $(CC) $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi_linked $(LINKER_FLAGS) $(BUILD_DIR): mkdir -p $@ diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/build/final_accuracy b/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/build/final_accuracy deleted file mode 100644 index 327358db8f07ecc900e3d5c5e23d99c194dcc4f0..0000000000000000000000000000000000000000 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/build/final_accuracy +++ /dev/null @@ -1 +0,0 @@ -89.500000 \ No newline at end of file diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/build/vgg16_cifar10.ll b/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/build/vgg16_cifar10.ll index c886277a9b4f2cf13f5b17d8639cd9f4b059898b..3bd634b7f1e294760d8c9092f53d8dd9e429b446 100644 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/build/vgg16_cifar10.ll +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/build/vgg16_cifar10.ll @@ -3,9 +3,13 @@ source_filename = "src/vgg16_cifar10.cpp" target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" +%"class.std::ios_base::Init" = type { i8 } +%"class.std::vector" = type { %"struct.std::_Vector_base" } +%"struct.std::_Vector_base" = type { %"struct.std::_Vector_base<float, std::allocator<float> >::_Vector_impl" } +%"struct.std::_Vector_base<float, std::allocator<float> >::_Vector_impl" = type { float*, float*, float* } %struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] } %struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 } -%struct.Tensor = type { i32, i32, i32, %struct.cudnnTensorStruct*, %struct.cudnnFilterStruct*, i8*, i8*, i64, i64, %struct.Dimension } +%struct.Tensor = type { i32, i32, i32, i32, %struct.cudnnTensorStruct*, %struct.cudnnFilterStruct*, %struct.cudnnTensorStruct*, %struct.cudnnFilterStruct*, i8*, i8*, i8*, i64, i64, %struct.Dimension } %struct.cudnnTensorStruct = type opaque %struct.cudnnFilterStruct = type opaque %struct.Dimension = type { i32, i64* } @@ -30,74 +34,127 @@ target triple = "x86_64-unknown-linux-gnu" %struct.__locale_data = type opaque %"class.std::num_put" = type { %"class.std::locale::facet.base", [4 x i8] } %"class.std::num_get" = type { %"class.std::locale::facet.base", [4 x i8] } - -@.str.1 = private unnamed_addr constant [19 x i8] c"tensor dims = %d \0A\00", align 1 -@.str.2 = private unnamed_addr constant [18 x i8] c"dim1_size = %zu \0A\00", align 1 -@.str.3 = private unnamed_addr constant [18 x i8] c"dim2_size = %zu \0A\00", align 1 -@.str.4 = private unnamed_addr constant [18 x i8] c"num_elems = %zu \0A\00", align 1 -@.str.5 = private unnamed_addr constant [3 x i8] c"wb\00", align 1 -@.str.6 = private unnamed_addr constant [58 x i8] c"File %s could not be created. Check if directory exists \0A\00", align 1 -@.str.7 = private unnamed_addr constant [22 x i8] c"size_in_bytes = %zu \0A\00", align 1 -@.str.8 = private unnamed_addr constant [21 x i8] c"bytes_written = %zu\0A\00", align 1 -@.str.9 = private unnamed_addr constant [4 x i8] c"%f,\00", align 1 -@.str.11 = private unnamed_addr constant [18 x i8] c"Num_elems = %zu \0A\00", align 1 -@.str.12 = private unnamed_addr constant [16 x i8] c"dim[%d] = %zu \0A\00", align 1 -@.str.13 = private unnamed_addr constant [35 x i8] c"Tensor data mismatch at index %d \0A\00", align 1 -@.str.14 = private unnamed_addr constant [21 x i8] c"Tensor data mismatch\00", align 1 -@.str.15 = private unnamed_addr constant [3 x i8] c"rb\00", align 1 -@.str.16 = private unnamed_addr constant [41 x i8] c"Data file %s is not found. Aborting... \0A\00", align 1 -@.str.17 = private unnamed_addr constant [23 x i8] c"tensor_data[%d] = %f \0A\00", align 1 -@.str.18 = private unnamed_addr constant [40 x i8] c"Data file %s is not found. Aborting...\0A\00", align 1 -@.str.19 = private unnamed_addr constant [26 x i8] c"*Label bytes_read = %zu \0A\00", align 1 -@.str.20 = private unnamed_addr constant [24 x i8] c"****** Accuracy = %f \0A\0A\00", align 1 -@.str.21 = private unnamed_addr constant [15 x i8] c"final_accuracy\00", align 1 -@.str.22 = private unnamed_addr constant [3 x i8] c"w+\00", align 1 -@.str.23 = private unnamed_addr constant [72 x i8] c"../../../../../../projects/hpvm-tensor-rt/model_params/vgg16_cifar10_2/\00", align 1 -@.str.24 = private unnamed_addr constant [10 x i8] c"input.bin\00", align 1 -@.str.25 = private unnamed_addr constant [11 x i8] c"labels.bin\00", align 1 -@.str.26 = private unnamed_addr constant [15 x i8] c"conv2d_1_w.bin\00", align 1 -@.str.27 = private unnamed_addr constant [15 x i8] c"conv2d_1_b.bin\00", align 1 -@.str.28 = private unnamed_addr constant [15 x i8] c"conv2d_2_w.bin\00", align 1 -@.str.29 = private unnamed_addr constant [15 x i8] c"conv2d_2_b.bin\00", align 1 -@.str.30 = private unnamed_addr constant [15 x i8] c"conv2d_3_w.bin\00", align 1 -@.str.31 = private unnamed_addr constant [15 x i8] c"conv2d_3_b.bin\00", align 1 -@.str.32 = private unnamed_addr constant [15 x i8] c"conv2d_4_w.bin\00", align 1 -@.str.33 = private unnamed_addr constant [15 x i8] c"conv2d_4_b.bin\00", align 1 -@.str.34 = private unnamed_addr constant [15 x i8] c"conv2d_5_w.bin\00", align 1 -@.str.35 = private unnamed_addr constant [15 x i8] c"conv2d_5_b.bin\00", align 1 -@.str.36 = private unnamed_addr constant [15 x i8] c"conv2d_6_w.bin\00", align 1 -@.str.37 = private unnamed_addr constant [15 x i8] c"conv2d_6_b.bin\00", align 1 -@.str.38 = private unnamed_addr constant [15 x i8] c"conv2d_7_w.bin\00", align 1 -@.str.39 = private unnamed_addr constant [15 x i8] c"conv2d_7_b.bin\00", align 1 -@.str.40 = private unnamed_addr constant [15 x i8] c"conv2d_8_w.bin\00", align 1 -@.str.41 = private unnamed_addr constant [15 x i8] c"conv2d_8_b.bin\00", align 1 -@.str.42 = private unnamed_addr constant [15 x i8] c"conv2d_9_w.bin\00", align 1 -@.str.43 = private unnamed_addr constant [15 x i8] c"conv2d_9_b.bin\00", align 1 -@.str.44 = private unnamed_addr constant [16 x i8] c"conv2d_10_w.bin\00", align 1 -@.str.45 = private unnamed_addr constant [16 x i8] c"conv2d_10_b.bin\00", align 1 -@.str.46 = private unnamed_addr constant [16 x i8] c"conv2d_11_w.bin\00", align 1 -@.str.47 = private unnamed_addr constant [16 x i8] c"conv2d_11_b.bin\00", align 1 -@.str.48 = private unnamed_addr constant [16 x i8] c"conv2d_12_w.bin\00", align 1 -@.str.49 = private unnamed_addr constant [16 x i8] c"conv2d_12_b.bin\00", align 1 -@.str.50 = private unnamed_addr constant [16 x i8] c"conv2d_13_w.bin\00", align 1 -@.str.51 = private unnamed_addr constant [16 x i8] c"conv2d_13_b.bin\00", align 1 -@.str.52 = private unnamed_addr constant [14 x i8] c"dense_1_w.bin\00", align 1 -@.str.53 = private unnamed_addr constant [14 x i8] c"dense_1_b.bin\00", align 1 -@.str.54 = private unnamed_addr constant [14 x i8] c"dense_2_w.bin\00", align 1 -@.str.55 = private unnamed_addr constant [14 x i8] c"dense_2_b.bin\00", align 1 +%struct.ClassProb = type { float, i32 } + +$_ZNSt6vectorIfSaIfEED2Ev = comdat any + +$_ZSt16__introsort_loopIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElNS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_T0_T1_ = comdat any + +$_ZSt11__make_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_T0_ = comdat any + +@_ZStL8__ioinit = internal global %"class.std::ios_base::Init" zeroinitializer, align 1 +@__dso_handle = external global i8 +@run_accuracies = global %"class.std::vector" zeroinitializer, align 8 +@.str.2 = private unnamed_addr constant [19 x i8] c"tensor dims = %d \0A\00", align 1 +@.str.3 = private unnamed_addr constant [18 x i8] c"dim1_size = %lu \0A\00", align 1 +@.str.4 = private unnamed_addr constant [18 x i8] c"dim2_size = %lu \0A\00", align 1 +@.str.5 = private unnamed_addr constant [18 x i8] c"num_elems = %lu \0A\00", align 1 +@.str.6 = private unnamed_addr constant [3 x i8] c"wb\00", align 1 +@.str.7 = private unnamed_addr constant [58 x i8] c"File %s could not be created. Check if directory exists \0A\00", align 1 +@.str.8 = private unnamed_addr constant [4 x i8] c"%f,\00", align 1 +@.str.10 = private unnamed_addr constant [18 x i8] c"Num_elems = %lu \0A\00", align 1 +@.str.11 = private unnamed_addr constant [16 x i8] c"dim[%d] = %lu \0A\00", align 1 +@.str.12 = private unnamed_addr constant [35 x i8] c"Tensor data mismatch at index %d \0A\00", align 1 +@.str.13 = private unnamed_addr constant [21 x i8] c"Tensor data mismatch\00", align 1 +@.str.14 = private unnamed_addr constant [3 x i8] c"rb\00", align 1 +@.str.15 = private unnamed_addr constant [41 x i8] c"Data file %s is not found. Aborting... \0A\00", align 1 +@.str.16 = private unnamed_addr constant [40 x i8] c"size in bytes = %lu, bytes read = %lu \0A\00", align 1 +@.str.17 = private unnamed_addr constant [23 x i8] c"size_in_bytes = %lu \0A\00", align 1 +@.str.18 = private unnamed_addr constant [31 x i8] c"******NOTE: tensor Dims = %d \0A\00", align 1 +@.str.20 = private unnamed_addr constant [40 x i8] c"Data file %s is not found. Aborting...\0A\00", align 1 +@.str.21 = private unnamed_addr constant [24 x i8] c"****** Accuracy = %f \0A\0A\00", align 1 +@.str.22 = private unnamed_addr constant [15 x i8] c"final_accuracy\00", align 1 +@.str.23 = private unnamed_addr constant [3 x i8] c"w+\00", align 1 +@.str.24 = private unnamed_addr constant [34 x i8] c"batch_dim = %lu, channels = %lu \0A\00", align 1 +@.str.25 = private unnamed_addr constant [37 x i8] c"batch_dim = %lu, num_classes = %lu \0A\00", align 1 +@.str.26 = private unnamed_addr constant [30 x i8] c"\0A\0A **** Final Accuracy = %f \0A\00", align 1 +@.str.27 = private unnamed_addr constant [9 x i8] c"avg_psnr\00", align 1 +@.str.28 = private unnamed_addr constant [13 x i8] c"psnr_std.txt\00", align 1 +@.str.29 = private unnamed_addr constant [19 x i8] c"run_accuracies.txt\00", align 1 +@.str.30 = private unnamed_addr constant [2 x i8] c"r\00", align 1 +@.str.32 = private unnamed_addr constant [3 x i8] c"%f\00", align 1 +@.str.33 = private unnamed_addr constant [23 x i8] c"**** PSNR read = %f \0A\0A\00", align 1 +@.str.34 = private unnamed_addr constant [9 x i8] c"psnr.txt\00", align 1 +@.str.35 = private unnamed_addr constant [36 x i8] c"batch_dim = %lu, image_size = %lu \0A\00", align 1 +@.str.36 = private unnamed_addr constant [13 x i8] c"img_psnr.txt\00", align 1 +@.str.37 = private unnamed_addr constant [18 x i8] c"PSNR value = %f \0A\00", align 1 +@.str.38 = private unnamed_addr constant [26 x i8] c"*** violation_rate= %f \0A\0A\00", align 1 +@.str.39 = private unnamed_addr constant [22 x i8] c"*** avg_psnr = %f \0A\0A\00", align 1 +@.str.40 = private unnamed_addr constant [23 x i8] c"** Output size = %lu \0A\00", align 1 +@.str.41 = private unnamed_addr constant [70 x i8] c"../../../../../../projects/hpvm-tensor-rt/model_params/vgg16_cifar10/\00", align 1 +@.str.42 = private unnamed_addr constant [10 x i8] c"input.bin\00", align 1 +@.str.43 = private unnamed_addr constant [13 x i8] c"labels32.bin\00", align 1 +@.str.44 = private unnamed_addr constant [15 x i8] c"conv2d_1_w.bin\00", align 1 +@.str.45 = private unnamed_addr constant [15 x i8] c"conv2d_1_b.bin\00", align 1 +@.str.46 = private unnamed_addr constant [15 x i8] c"conv2d_2_w.bin\00", align 1 +@.str.47 = private unnamed_addr constant [15 x i8] c"conv2d_2_b.bin\00", align 1 +@.str.48 = private unnamed_addr constant [15 x i8] c"conv2d_3_w.bin\00", align 1 +@.str.49 = private unnamed_addr constant [15 x i8] c"conv2d_3_b.bin\00", align 1 +@.str.50 = private unnamed_addr constant [15 x i8] c"conv2d_4_w.bin\00", align 1 +@.str.51 = private unnamed_addr constant [15 x i8] c"conv2d_4_b.bin\00", align 1 +@.str.52 = private unnamed_addr constant [15 x i8] c"conv2d_5_w.bin\00", align 1 +@.str.53 = private unnamed_addr constant [15 x i8] c"conv2d_5_b.bin\00", align 1 +@.str.54 = private unnamed_addr constant [15 x i8] c"conv2d_6_w.bin\00", align 1 +@.str.55 = private unnamed_addr constant [15 x i8] c"conv2d_6_b.bin\00", align 1 +@.str.56 = private unnamed_addr constant [15 x i8] c"conv2d_7_w.bin\00", align 1 +@.str.57 = private unnamed_addr constant [15 x i8] c"conv2d_7_b.bin\00", align 1 +@.str.58 = private unnamed_addr constant [15 x i8] c"conv2d_8_w.bin\00", align 1 +@.str.59 = private unnamed_addr constant [15 x i8] c"conv2d_8_b.bin\00", align 1 +@.str.60 = private unnamed_addr constant [15 x i8] c"conv2d_9_w.bin\00", align 1 +@.str.61 = private unnamed_addr constant [15 x i8] c"conv2d_9_b.bin\00", align 1 +@.str.62 = private unnamed_addr constant [16 x i8] c"conv2d_10_w.bin\00", align 1 +@.str.63 = private unnamed_addr constant [16 x i8] c"conv2d_10_b.bin\00", align 1 +@.str.64 = private unnamed_addr constant [16 x i8] c"conv2d_11_w.bin\00", align 1 +@.str.65 = private unnamed_addr constant [16 x i8] c"conv2d_11_b.bin\00", align 1 +@.str.66 = private unnamed_addr constant [16 x i8] c"conv2d_12_w.bin\00", align 1 +@.str.67 = private unnamed_addr constant [16 x i8] c"conv2d_12_b.bin\00", align 1 +@.str.68 = private unnamed_addr constant [16 x i8] c"conv2d_13_w.bin\00", align 1 +@.str.69 = private unnamed_addr constant [16 x i8] c"conv2d_13_b.bin\00", align 1 +@.str.70 = private unnamed_addr constant [14 x i8] c"dense_1_w.bin\00", align 1 +@.str.71 = private unnamed_addr constant [14 x i8] c"dense_1_b.bin\00", align 1 +@.str.72 = private unnamed_addr constant [14 x i8] c"dense_2_w.bin\00", align 1 +@.str.73 = private unnamed_addr constant [14 x i8] c"dense_2_b.bin\00", align 1 @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE = external unnamed_addr constant { [5 x i8*], [5 x i8*] } @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE = external unnamed_addr constant [4 x i8*] @_ZTVSt9basic_iosIcSt11char_traitsIcEE = external unnamed_addr constant { [4 x i8*] } @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE = external unnamed_addr constant { [16 x i8*] } @_ZTVSt15basic_streambufIcSt11char_traitsIcEE = external unnamed_addr constant { [16 x i8*] } +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @_GLOBAL__sub_I_vgg16_cifar10.cpp, i8* null }] @str = private unnamed_addr constant [23 x i8] c"Successful cudaMalloc \00" +@str.78 = private unnamed_addr constant [27 x i8] c"ERROR: NULL data pointers \00" +@str.79 = private unnamed_addr constant [28 x i8] c"ERROR: psnr.txt not found! \00" + +declare void @_ZNSt8ios_base4InitC1Ev(%"class.std::ios_base::Init"*) unnamed_addr #0 + +; Function Attrs: nounwind +declare void @_ZNSt8ios_base4InitD1Ev(%"class.std::ios_base::Init"*) unnamed_addr #1 + +; Function Attrs: nounwind +declare i32 @__cxa_atexit(void (i8*)*, i8*, i8*) local_unnamed_addr #2 + +; Function Attrs: nounwind uwtable +define linkonce_odr void @_ZNSt6vectorIfSaIfEED2Ev(%"class.std::vector"* %this) unnamed_addr #3 comdat align 2 { +entry: + %_M_start.i = getelementptr inbounds %"class.std::vector", %"class.std::vector"* %this, i64 0, i32 0, i32 0, i32 0 + %0 = load float*, float** %_M_start.i, align 8, !tbaa !1 + %tobool.i.i = icmp eq float* %0, null + br i1 %tobool.i.i, label %_ZNSt12_Vector_baseIfSaIfEED2Ev.exit, label %if.then.i.i + +if.then.i.i: ; preds = %entry + %1 = bitcast float* %0 to i8* + tail call void @_ZdlPv(i8* %1) #2 + br label %_ZNSt12_Vector_baseIfSaIfEED2Ev.exit + +_ZNSt12_Vector_baseIfSaIfEED2Ev.exit: ; preds = %entry, %if.then.i.i + ret void +} ; Function Attrs: nounwind uwtable -define void @_Z15printTensorInfoPv(i8* nocapture readonly %tensor_ptr) local_unnamed_addr #0 { +define void @_Z15printTensorInfoPv(i8* nocapture readonly %tensor_ptr) local_unnamed_addr #3 { entry: - %gpu_data = getelementptr inbounds i8, i8* %tensor_ptr, i64 40 + %gpu_data = getelementptr inbounds i8, i8* %tensor_ptr, i64 56 %0 = bitcast i8* %gpu_data to i8** - %1 = load i8*, i8** %0, align 8, !tbaa !1 + %1 = load i8*, i8** %0, align 8, !tbaa !7 %cmp = icmp eq i8* %1, null br i1 %cmp, label %if.end, label %if.then @@ -106,93 +163,90 @@ if.then: ; preds = %entry br label %if.end if.end: ; preds = %entry, %if.then - %dims = getelementptr inbounds i8, i8* %tensor_ptr, i64 64 + %dims = getelementptr inbounds i8, i8* %tensor_ptr, i64 88 %num_dims = bitcast i8* %dims to i32* - %2 = load i32, i32* %num_dims, align 8, !tbaa !10 - %call1 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([19 x i8], [19 x i8]* @.str.1, i64 0, i64 0), i32 %2) - %dim_sizes = getelementptr inbounds i8, i8* %tensor_ptr, i64 72 + %2 = load i32, i32* %num_dims, align 8, !tbaa !13 + %call1 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([19 x i8], [19 x i8]* @.str.2, i64 0, i64 0), i32 %2) + %dim_sizes = getelementptr inbounds i8, i8* %tensor_ptr, i64 96 %3 = bitcast i8* %dim_sizes to i64** - %4 = load i64*, i64** %3, align 8, !tbaa !11 - %5 = load i64, i64* %4, align 8, !tbaa !12 - %call3 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.2, i64 0, i64 0), i64 %5) - %6 = load i64*, i64** %3, align 8, !tbaa !11 + %4 = load i64*, i64** %3, align 8, !tbaa !14 + %5 = load i64, i64* %4, align 8, !tbaa !15 + %call3 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.3, i64 0, i64 0), i64 %5) + %6 = load i64*, i64** %3, align 8, !tbaa !14 %arrayidx6 = getelementptr inbounds i64, i64* %6, i64 1 - %7 = load i64, i64* %arrayidx6, align 8, !tbaa !12 - %call7 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.3, i64 0, i64 0), i64 %7) - %num_elems = getelementptr inbounds i8, i8* %tensor_ptr, i64 48 + %7 = load i64, i64* %arrayidx6, align 8, !tbaa !15 + %call7 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.4, i64 0, i64 0), i64 %7) + %num_elems = getelementptr inbounds i8, i8* %tensor_ptr, i64 72 %8 = bitcast i8* %num_elems to i64* - %9 = load i64, i64* %8, align 8, !tbaa !13 - %call8 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.4, i64 0, i64 0), i64 %9) + %9 = load i64, i64* %8, align 8, !tbaa !16 + %call8 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.5, i64 0, i64 0), i64 %9) ret void } ; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.start(i64, i8* nocapture) #1 +declare void @llvm.lifetime.start(i64, i8* nocapture) #4 ; Function Attrs: nounwind -declare i32 @printf(i8* nocapture readonly, ...) local_unnamed_addr #2 +declare i32 @printf(i8* nocapture readonly, ...) local_unnamed_addr #1 ; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.end(i64, i8* nocapture) #1 +declare void @llvm.lifetime.end(i64, i8* nocapture) #4 ; Function Attrs: nounwind uwtable -define void @_Z17dumpWeightsToFilePcPv(i8* %file_name, i8* %weights_ptr) local_unnamed_addr #0 { +define void @_Z17dumpWeightsToFilePcPv(i8* %file_name, i8* %weights_ptr) local_unnamed_addr #3 { entry: - tail call void @hpvm_request_tensor(i8* %weights_ptr, i32 0) #7 - %call = tail call %struct._IO_FILE* @fopen(i8* %file_name, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.5, i64 0, i64 0)) + tail call void @hpvm_request_tensor(i8* %weights_ptr, i32 0) #2 + %call = tail call %struct._IO_FILE* @fopen(i8* %file_name, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.6, i64 0, i64 0)) %cmp = icmp eq %struct._IO_FILE* %call, null br i1 %cmp, label %if.then, label %if.end if.then: ; preds = %entry - %call1 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([58 x i8], [58 x i8]* @.str.6, i64 0, i64 0), i8* %file_name) - tail call void @abort() #8 + %call1 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([58 x i8], [58 x i8]* @.str.7, i64 0, i64 0), i8* %file_name) + tail call void @abort() #13 unreachable if.end: ; preds = %entry - %size_in_bytes = getelementptr inbounds i8, i8* %weights_ptr, i64 56 - %0 = bitcast i8* %size_in_bytes to i64* - %1 = load i64, i64* %0, align 8, !tbaa !14 - %call2 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.7, i64 0, i64 0), i64 %1) - %host_data = getelementptr inbounds i8, i8* %weights_ptr, i64 32 - %2 = bitcast i8* %host_data to i8** - %3 = load i8*, i8** %2, align 8, !tbaa !15 - %4 = load i64, i64* %0, align 8, !tbaa !14 - %call4 = tail call i64 @fwrite(i8* %3, i64 1, i64 %4, %struct._IO_FILE* nonnull %call) - %call5 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([21 x i8], [21 x i8]* @.str.8, i64 0, i64 0), i64 %call4) - %call6 = tail call i32 @fclose(%struct._IO_FILE* nonnull %call) + %host_data = getelementptr inbounds i8, i8* %weights_ptr, i64 48 + %0 = bitcast i8* %host_data to i8** + %1 = load i8*, i8** %0, align 8, !tbaa !17 + %size_in_bytes = getelementptr inbounds i8, i8* %weights_ptr, i64 80 + %2 = bitcast i8* %size_in_bytes to i64* + %3 = load i64, i64* %2, align 8, !tbaa !18 + %call2 = tail call i64 @fwrite(i8* %1, i64 1, i64 %3, %struct._IO_FILE* nonnull %call) + %call3 = tail call i32 @fclose(%struct._IO_FILE* nonnull %call) ret void } -declare void @hpvm_request_tensor(i8*, i32) local_unnamed_addr #3 +declare void @hpvm_request_tensor(i8*, i32) local_unnamed_addr #0 ; Function Attrs: nounwind -declare noalias %struct._IO_FILE* @fopen(i8* nocapture readonly, i8* nocapture readonly) local_unnamed_addr #2 +declare noalias %struct._IO_FILE* @fopen(i8* nocapture readonly, i8* nocapture readonly) local_unnamed_addr #1 ; Function Attrs: noreturn nounwind -declare void @abort() local_unnamed_addr #4 +declare void @abort() local_unnamed_addr #5 ; Function Attrs: nounwind -declare i64 @fwrite(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) local_unnamed_addr #2 +declare i64 @fwrite(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) local_unnamed_addr #1 ; Function Attrs: nounwind -declare i32 @fclose(%struct._IO_FILE* nocapture) local_unnamed_addr #2 +declare i32 @fclose(%struct._IO_FILE* nocapture) local_unnamed_addr #1 ; Function Attrs: nounwind uwtable -define void @_Z18fillTensorWithOnesPv(i8* %tensor_ptr) local_unnamed_addr #0 { +define void @_Z18fillTensorWithOnesPv(i8* %tensor_ptr) local_unnamed_addr #3 { entry: - tail call void @hpvm_request_tensor(i8* %tensor_ptr, i32 0) #7 + tail call void @hpvm_request_tensor(i8* %tensor_ptr, i32 0) #2 %data_type = bitcast i8* %tensor_ptr to i32* - %0 = load i32, i32* %data_type, align 8, !tbaa !16 + %0 = load i32, i32* %data_type, align 8, !tbaa !19 %cmp = icmp eq i32 %0, 0 br i1 %cmp, label %if.then, label %if.end if.then: ; preds = %entry - %host_data = getelementptr inbounds i8, i8* %tensor_ptr, i64 32 + %host_data = getelementptr inbounds i8, i8* %tensor_ptr, i64 48 %1 = bitcast i8* %host_data to float** - %2 = load float*, float** %1, align 8, !tbaa !15 - %num_elems = getelementptr inbounds i8, i8* %tensor_ptr, i64 48 + %2 = load float*, float** %1, align 8, !tbaa !17 + %num_elems = getelementptr inbounds i8, i8* %tensor_ptr, i64 72 %3 = bitcast i8* %num_elems to i64* - %4 = load i64, i64* %3, align 8, !tbaa !13 + %4 = load i64, i64* %3, align 8, !tbaa !16 %cmp110 = icmp eq i64 %4, 0 br i1 %cmp110, label %if.end, label %for.body.preheader @@ -230,14 +284,14 @@ vector.body.prol: ; preds = %vector.body.prol, % %prol.iter = phi i64 [ %prol.iter.sub, %vector.body.prol ], [ %xtraiter, %vector.body.prol.preheader ] %13 = getelementptr inbounds float, float* %2, i64 %index.prol %14 = bitcast float* %13 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %14, align 4, !tbaa !17 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %14, align 4, !tbaa !20 %15 = getelementptr float, float* %13, i64 4 %16 = bitcast float* %15 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %16, align 4, !tbaa !17 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %16, align 4, !tbaa !20 %index.next.prol = add i64 %index.prol, 8 %prol.iter.sub = add i64 %prol.iter, -1 %prol.iter.cmp = icmp eq i64 %prol.iter.sub, 0 - br i1 %prol.iter.cmp, label %vector.body.prol.loopexit.unr-lcssa, label %vector.body.prol, !llvm.loop !19 + br i1 %prol.iter.cmp, label %vector.body.prol.loopexit.unr-lcssa, label %vector.body.prol, !llvm.loop !22 vector.body.prol.loopexit.unr-lcssa: ; preds = %vector.body.prol br label %vector.body.prol.loopexit @@ -254,62 +308,62 @@ vector.body: ; preds = %vector.body, %vecto %index = phi i64 [ %index.unr, %vector.body.preheader.new ], [ %index.next.7, %vector.body ] %18 = getelementptr inbounds float, float* %2, i64 %index %19 = bitcast float* %18 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %19, align 4, !tbaa !17 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %19, align 4, !tbaa !20 %20 = getelementptr float, float* %18, i64 4 %21 = bitcast float* %20 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %21, align 4, !tbaa !17 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %21, align 4, !tbaa !20 %index.next = add i64 %index, 8 %22 = getelementptr inbounds float, float* %2, i64 %index.next %23 = bitcast float* %22 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %23, align 4, !tbaa !17 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %23, align 4, !tbaa !20 %24 = getelementptr float, float* %22, i64 4 %25 = bitcast float* %24 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %25, align 4, !tbaa !17 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %25, align 4, !tbaa !20 %index.next.1 = add i64 %index, 16 %26 = getelementptr inbounds float, float* %2, i64 %index.next.1 %27 = bitcast float* %26 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %27, align 4, !tbaa !17 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %27, align 4, !tbaa !20 %28 = getelementptr float, float* %26, i64 4 %29 = bitcast float* %28 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %29, align 4, !tbaa !17 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %29, align 4, !tbaa !20 %index.next.2 = add i64 %index, 24 %30 = getelementptr inbounds float, float* %2, i64 %index.next.2 %31 = bitcast float* %30 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %31, align 4, !tbaa !17 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %31, align 4, !tbaa !20 %32 = getelementptr float, float* %30, i64 4 %33 = bitcast float* %32 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %33, align 4, !tbaa !17 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %33, align 4, !tbaa !20 %index.next.3 = add i64 %index, 32 %34 = getelementptr inbounds float, float* %2, i64 %index.next.3 %35 = bitcast float* %34 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %35, align 4, !tbaa !17 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %35, align 4, !tbaa !20 %36 = getelementptr float, float* %34, i64 4 %37 = bitcast float* %36 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %37, align 4, !tbaa !17 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %37, align 4, !tbaa !20 %index.next.4 = add i64 %index, 40 %38 = getelementptr inbounds float, float* %2, i64 %index.next.4 %39 = bitcast float* %38 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %39, align 4, !tbaa !17 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %39, align 4, !tbaa !20 %40 = getelementptr float, float* %38, i64 4 %41 = bitcast float* %40 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %41, align 4, !tbaa !17 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %41, align 4, !tbaa !20 %index.next.5 = add i64 %index, 48 %42 = getelementptr inbounds float, float* %2, i64 %index.next.5 %43 = bitcast float* %42 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %43, align 4, !tbaa !17 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %43, align 4, !tbaa !20 %44 = getelementptr float, float* %42, i64 4 %45 = bitcast float* %44 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %45, align 4, !tbaa !17 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %45, align 4, !tbaa !20 %index.next.6 = add i64 %index, 56 %46 = getelementptr inbounds float, float* %2, i64 %index.next.6 %47 = bitcast float* %46 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %47, align 4, !tbaa !17 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %47, align 4, !tbaa !20 %48 = getelementptr float, float* %46, i64 4 %49 = bitcast float* %48 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %49, align 4, !tbaa !17 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %49, align 4, !tbaa !20 %index.next.7 = add i64 %index, 64 %50 = icmp eq i64 %index.next.7, %n.vec - br i1 %50, label %middle.block.unr-lcssa, label %vector.body, !llvm.loop !21 + br i1 %50, label %middle.block.unr-lcssa, label %vector.body, !llvm.loop !24 middle.block.unr-lcssa: ; preds = %vector.body br label %middle.block @@ -327,11 +381,11 @@ for.body: ; preds = %for.body.preheader2 %conv12 = phi i64 [ %conv, %for.body ], [ %conv12.ph, %for.body.preheader22 ] %i.011 = phi i32 [ %inc, %for.body ], [ %i.011.ph, %for.body.preheader22 ] %arrayidx = getelementptr inbounds float, float* %2, i64 %conv12 - store float 1.000000e+00, float* %arrayidx, align 4, !tbaa !17 + store float 1.000000e+00, float* %arrayidx, align 4, !tbaa !20 %inc = add i32 %i.011, 1 %conv = zext i32 %inc to i64 %cmp1 = icmp ult i64 %conv, %4 - br i1 %cmp1, label %for.body, label %if.end.loopexit, !llvm.loop !24 + br i1 %cmp1, label %for.body, label %if.end.loopexit, !llvm.loop !27 if.end.loopexit: ; preds = %for.body br label %if.end @@ -341,21 +395,21 @@ if.end: ; preds = %if.end.loopexit, %m } ; Function Attrs: nounwind uwtable -define void @_Z19fillWithOnesAndTwosPv(i8* %tensor_ptr) local_unnamed_addr #0 { +define void @_Z19fillWithOnesAndTwosPv(i8* %tensor_ptr) local_unnamed_addr #3 { entry: - tail call void @hpvm_request_tensor(i8* %tensor_ptr, i32 0) #7 + tail call void @hpvm_request_tensor(i8* %tensor_ptr, i32 0) #2 %data_type = bitcast i8* %tensor_ptr to i32* - %0 = load i32, i32* %data_type, align 8, !tbaa !16 + %0 = load i32, i32* %data_type, align 8, !tbaa !19 %cmp = icmp eq i32 %0, 0 br i1 %cmp, label %if.then, label %if.end if.then: ; preds = %entry - %host_data = getelementptr inbounds i8, i8* %tensor_ptr, i64 32 + %host_data = getelementptr inbounds i8, i8* %tensor_ptr, i64 48 %1 = bitcast i8* %host_data to float** - %2 = load float*, float** %1, align 8, !tbaa !15 - %num_elems = getelementptr inbounds i8, i8* %tensor_ptr, i64 48 + %2 = load float*, float** %1, align 8, !tbaa !17 + %num_elems = getelementptr inbounds i8, i8* %tensor_ptr, i64 72 %3 = bitcast i8* %num_elems to i64* - %4 = load i64, i64* %3, align 8, !tbaa !13 + %4 = load i64, i64* %3, align 8, !tbaa !16 %div35 = lshr i64 %4, 1 %cmp136 = icmp eq i64 %div35, 0 br i1 %cmp136, label %for.cond.cleanup, label %for.body.preheader @@ -399,14 +453,14 @@ vector.body.prol: ; preds = %vector.body.prol, % %prol.iter88 = phi i64 [ %prol.iter88.sub, %vector.body.prol ], [ %xtraiter86, %vector.body.prol.preheader ] %13 = getelementptr inbounds float, float* %2, i64 %index.prol %14 = bitcast float* %13 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %14, align 4, !tbaa !17 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %14, align 4, !tbaa !20 %15 = getelementptr float, float* %13, i64 4 %16 = bitcast float* %15 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %16, align 4, !tbaa !17 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %16, align 4, !tbaa !20 %index.next.prol = add i64 %index.prol, 8 %prol.iter88.sub = add i64 %prol.iter88, -1 %prol.iter88.cmp = icmp eq i64 %prol.iter88.sub, 0 - br i1 %prol.iter88.cmp, label %vector.body.prol.loopexit.unr-lcssa, label %vector.body.prol, !llvm.loop !25 + br i1 %prol.iter88.cmp, label %vector.body.prol.loopexit.unr-lcssa, label %vector.body.prol, !llvm.loop !28 vector.body.prol.loopexit.unr-lcssa: ; preds = %vector.body.prol br label %vector.body.prol.loopexit @@ -423,62 +477,62 @@ vector.body: ; preds = %vector.body, %vecto %index = phi i64 [ %index.unr, %vector.body.preheader.new ], [ %index.next.7, %vector.body ] %18 = getelementptr inbounds float, float* %2, i64 %index %19 = bitcast float* %18 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %19, align 4, !tbaa !17 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %19, align 4, !tbaa !20 %20 = getelementptr float, float* %18, i64 4 %21 = bitcast float* %20 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %21, align 4, !tbaa !17 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %21, align 4, !tbaa !20 %index.next = add i64 %index, 8 %22 = getelementptr inbounds float, float* %2, i64 %index.next %23 = bitcast float* %22 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %23, align 4, !tbaa !17 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %23, align 4, !tbaa !20 %24 = getelementptr float, float* %22, i64 4 %25 = bitcast float* %24 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %25, align 4, !tbaa !17 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %25, align 4, !tbaa !20 %index.next.1 = add i64 %index, 16 %26 = getelementptr inbounds float, float* %2, i64 %index.next.1 %27 = bitcast float* %26 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %27, align 4, !tbaa !17 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %27, align 4, !tbaa !20 %28 = getelementptr float, float* %26, i64 4 %29 = bitcast float* %28 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %29, align 4, !tbaa !17 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %29, align 4, !tbaa !20 %index.next.2 = add i64 %index, 24 %30 = getelementptr inbounds float, float* %2, i64 %index.next.2 %31 = bitcast float* %30 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %31, align 4, !tbaa !17 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %31, align 4, !tbaa !20 %32 = getelementptr float, float* %30, i64 4 %33 = bitcast float* %32 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %33, align 4, !tbaa !17 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %33, align 4, !tbaa !20 %index.next.3 = add i64 %index, 32 %34 = getelementptr inbounds float, float* %2, i64 %index.next.3 %35 = bitcast float* %34 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %35, align 4, !tbaa !17 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %35, align 4, !tbaa !20 %36 = getelementptr float, float* %34, i64 4 %37 = bitcast float* %36 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %37, align 4, !tbaa !17 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %37, align 4, !tbaa !20 %index.next.4 = add i64 %index, 40 %38 = getelementptr inbounds float, float* %2, i64 %index.next.4 %39 = bitcast float* %38 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %39, align 4, !tbaa !17 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %39, align 4, !tbaa !20 %40 = getelementptr float, float* %38, i64 4 %41 = bitcast float* %40 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %41, align 4, !tbaa !17 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %41, align 4, !tbaa !20 %index.next.5 = add i64 %index, 48 %42 = getelementptr inbounds float, float* %2, i64 %index.next.5 %43 = bitcast float* %42 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %43, align 4, !tbaa !17 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %43, align 4, !tbaa !20 %44 = getelementptr float, float* %42, i64 4 %45 = bitcast float* %44 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %45, align 4, !tbaa !17 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %45, align 4, !tbaa !20 %index.next.6 = add i64 %index, 56 %46 = getelementptr inbounds float, float* %2, i64 %index.next.6 %47 = bitcast float* %46 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %47, align 4, !tbaa !17 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %47, align 4, !tbaa !20 %48 = getelementptr float, float* %46, i64 4 %49 = bitcast float* %48 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %49, align 4, !tbaa !17 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %49, align 4, !tbaa !20 %index.next.7 = add i64 %index, 64 %50 = icmp eq i64 %index.next.7, %n.vec - br i1 %50, label %middle.block.unr-lcssa, label %vector.body, !llvm.loop !26 + br i1 %50, label %middle.block.unr-lcssa, label %vector.body, !llvm.loop !29 middle.block.unr-lcssa: ; preds = %vector.body br label %middle.block @@ -556,14 +610,14 @@ vector.body49.prol: ; preds = %vector.body49.prol, %74 = add i64 %conv731, %index67.prol %75 = getelementptr inbounds float, float* %2, i64 %74 %76 = bitcast float* %75 to <4 x float>* - store <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <4 x float>* %76, align 4, !tbaa !17 + store <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <4 x float>* %76, align 4, !tbaa !20 %77 = getelementptr float, float* %75, i64 4 %78 = bitcast float* %77 to <4 x float>* - store <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <4 x float>* %78, align 4, !tbaa !17 + store <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <4 x float>* %78, align 4, !tbaa !20 %index.next68.prol = add i64 %index67.prol, 8 %prol.iter.sub = add i64 %prol.iter, -1 %prol.iter.cmp = icmp eq i64 %prol.iter.sub, 0 - br i1 %prol.iter.cmp, label %vector.body49.prol.loopexit.unr-lcssa, label %vector.body49.prol, !llvm.loop !27 + br i1 %prol.iter.cmp, label %vector.body49.prol.loopexit.unr-lcssa, label %vector.body49.prol, !llvm.loop !30 vector.body49.prol.loopexit.unr-lcssa: ; preds = %vector.body49.prol br label %vector.body49.prol.loopexit @@ -581,37 +635,37 @@ vector.body49: ; preds = %vector.body49, %vec %80 = add i64 %conv731, %index67 %81 = getelementptr inbounds float, float* %2, i64 %80 %82 = bitcast float* %81 to <4 x float>* - store <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <4 x float>* %82, align 4, !tbaa !17 + store <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <4 x float>* %82, align 4, !tbaa !20 %83 = getelementptr float, float* %81, i64 4 %84 = bitcast float* %83 to <4 x float>* - store <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <4 x float>* %84, align 4, !tbaa !17 + store <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <4 x float>* %84, align 4, !tbaa !20 %index.next68 = add i64 %index67, 8 %85 = add i64 %conv731, %index.next68 %86 = getelementptr inbounds float, float* %2, i64 %85 %87 = bitcast float* %86 to <4 x float>* - store <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <4 x float>* %87, align 4, !tbaa !17 + store <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <4 x float>* %87, align 4, !tbaa !20 %88 = getelementptr float, float* %86, i64 4 %89 = bitcast float* %88 to <4 x float>* - store <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <4 x float>* %89, align 4, !tbaa !17 + store <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <4 x float>* %89, align 4, !tbaa !20 %index.next68.1 = add i64 %index67, 16 %90 = add i64 %conv731, %index.next68.1 %91 = getelementptr inbounds float, float* %2, i64 %90 %92 = bitcast float* %91 to <4 x float>* - store <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <4 x float>* %92, align 4, !tbaa !17 + store <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <4 x float>* %92, align 4, !tbaa !20 %93 = getelementptr float, float* %91, i64 4 %94 = bitcast float* %93 to <4 x float>* - store <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <4 x float>* %94, align 4, !tbaa !17 + store <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <4 x float>* %94, align 4, !tbaa !20 %index.next68.2 = add i64 %index67, 24 %95 = add i64 %conv731, %index.next68.2 %96 = getelementptr inbounds float, float* %2, i64 %95 %97 = bitcast float* %96 to <4 x float>* - store <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <4 x float>* %97, align 4, !tbaa !17 + store <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <4 x float>* %97, align 4, !tbaa !20 %98 = getelementptr float, float* %96, i64 4 %99 = bitcast float* %98 to <4 x float>* - store <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <4 x float>* %99, align 4, !tbaa !17 + store <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <4 x float>* %99, align 4, !tbaa !20 %index.next68.3 = add i64 %index67, 32 %100 = icmp eq i64 %index.next68.3, %n.vec55 - br i1 %100, label %middle.block50.unr-lcssa, label %vector.body49, !llvm.loop !28 + br i1 %100, label %middle.block50.unr-lcssa, label %vector.body49, !llvm.loop !31 middle.block50.unr-lcssa: ; preds = %vector.body49 br label %middle.block50 @@ -624,21 +678,21 @@ for.body: ; preds = %for.body.preheader8 %conv38 = phi i64 [ %conv, %for.body ], [ %conv38.ph, %for.body.preheader85 ] %i.037 = phi i32 [ %inc, %for.body ], [ %i.037.ph, %for.body.preheader85 ] %arrayidx = getelementptr inbounds float, float* %2, i64 %conv38 - store float 1.000000e+00, float* %arrayidx, align 4, !tbaa !17 + store float 1.000000e+00, float* %arrayidx, align 4, !tbaa !20 %inc = add i32 %i.037, 1 %conv = zext i32 %inc to i64 %cmp1 = icmp ult i64 %conv, %div35 - br i1 %cmp1, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !29 + br i1 %cmp1, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !32 for.body11: ; preds = %for.body11.preheader, %for.body11 %conv734 = phi i64 [ %conv7, %for.body11 ], [ %conv734.ph, %for.body11.preheader ] %i2.033 = phi i32 [ %inc15, %for.body11 ], [ %i2.033.ph, %for.body11.preheader ] %arrayidx13 = getelementptr inbounds float, float* %2, i64 %conv734 - store float 2.000000e+00, float* %arrayidx13, align 4, !tbaa !17 + store float 2.000000e+00, float* %arrayidx13, align 4, !tbaa !20 %inc15 = add i32 %i2.033, 1 %conv7 = zext i32 %inc15 to i64 %cmp9 = icmp ult i64 %conv7, %4 - br i1 %cmp9, label %for.body11, label %if.end.loopexit, !llvm.loop !30 + br i1 %cmp9, label %for.body11, label %if.end.loopexit, !llvm.loop !33 if.end.loopexit: ; preds = %for.body11 br label %if.end @@ -648,21 +702,186 @@ if.end: ; preds = %if.end.loopexit, %m } ; Function Attrs: nounwind uwtable -define void @_Z21fillTensorWithNegOnesPv(i8* %tensor_ptr) local_unnamed_addr #0 { +define void @_Z17fillTensorWithValPvf(i8* %tensor_ptr, float %target_value) local_unnamed_addr #3 { +entry: + tail call void @hpvm_request_tensor(i8* %tensor_ptr, i32 0) #2 + %data_type = bitcast i8* %tensor_ptr to i32* + %0 = load i32, i32* %data_type, align 8, !tbaa !19 + %cmp = icmp eq i32 %0, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %host_data = getelementptr inbounds i8, i8* %tensor_ptr, i64 48 + %1 = bitcast i8* %host_data to float** + %2 = load float*, float** %1, align 8, !tbaa !17 + %num_elems = getelementptr inbounds i8, i8* %tensor_ptr, i64 72 + %3 = bitcast i8* %num_elems to i64* + %4 = load i64, i64* %3, align 8, !tbaa !16 + %cmp110 = icmp eq i64 %4, 0 + br i1 %cmp110, label %if.end, label %for.body.preheader + +for.body.preheader: ; preds = %if.then + %min.iters.check = icmp ult i64 %4, 8 + br i1 %min.iters.check, label %for.body.preheader24, label %min.iters.checked + +min.iters.checked: ; preds = %for.body.preheader + %n.vec = and i64 %4, -8 + %cmp.zero = icmp eq i64 %n.vec, 0 + br i1 %cmp.zero, label %for.body.preheader24, label %vector.scevcheck + +vector.scevcheck: ; preds = %min.iters.checked + %5 = add i64 %4, -1 + %6 = trunc i64 %5 to i32 + %7 = icmp eq i32 %6, -1 + %8 = icmp ugt i64 %5, 4294967295 + %9 = or i1 %7, %8 + %cast.crd = trunc i64 %n.vec to i32 + br i1 %9, label %for.body.preheader24, label %vector.ph + +vector.ph: ; preds = %vector.scevcheck + %broadcast.splatinsert22 = insertelement <4 x float> undef, float %target_value, i32 0 + %broadcast.splat23 = shufflevector <4 x float> %broadcast.splatinsert22, <4 x float> undef, <4 x i32> zeroinitializer + %10 = add i64 %n.vec, -8 + %11 = lshr exact i64 %10, 3 + %12 = add nuw nsw i64 %11, 1 + %xtraiter = and i64 %12, 7 + %lcmp.mod = icmp eq i64 %xtraiter, 0 + br i1 %lcmp.mod, label %vector.body.prol.loopexit, label %vector.body.prol.preheader + +vector.body.prol.preheader: ; preds = %vector.ph + br label %vector.body.prol + +vector.body.prol: ; preds = %vector.body.prol, %vector.body.prol.preheader + %index.prol = phi i64 [ 0, %vector.body.prol.preheader ], [ %index.next.prol, %vector.body.prol ] + %prol.iter = phi i64 [ %xtraiter, %vector.body.prol.preheader ], [ %prol.iter.sub, %vector.body.prol ] + %13 = getelementptr inbounds float, float* %2, i64 %index.prol + %14 = bitcast float* %13 to <4 x float>* + store <4 x float> %broadcast.splat23, <4 x float>* %14, align 4, !tbaa !20 + %15 = getelementptr float, float* %13, i64 4 + %16 = bitcast float* %15 to <4 x float>* + store <4 x float> %broadcast.splat23, <4 x float>* %16, align 4, !tbaa !20 + %index.next.prol = add i64 %index.prol, 8 + %prol.iter.sub = add i64 %prol.iter, -1 + %prol.iter.cmp = icmp eq i64 %prol.iter.sub, 0 + br i1 %prol.iter.cmp, label %vector.body.prol.loopexit.unr-lcssa, label %vector.body.prol, !llvm.loop !34 + +vector.body.prol.loopexit.unr-lcssa: ; preds = %vector.body.prol + br label %vector.body.prol.loopexit + +vector.body.prol.loopexit: ; preds = %vector.ph, %vector.body.prol.loopexit.unr-lcssa + %index.unr = phi i64 [ 0, %vector.ph ], [ %index.next.prol, %vector.body.prol.loopexit.unr-lcssa ] + %17 = icmp ult i64 %10, 56 + br i1 %17, label %middle.block, label %vector.ph.new + +vector.ph.new: ; preds = %vector.body.prol.loopexit + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph.new + %index = phi i64 [ %index.unr, %vector.ph.new ], [ %index.next.7, %vector.body ] + %18 = getelementptr inbounds float, float* %2, i64 %index + %19 = bitcast float* %18 to <4 x float>* + store <4 x float> %broadcast.splat23, <4 x float>* %19, align 4, !tbaa !20 + %20 = getelementptr float, float* %18, i64 4 + %21 = bitcast float* %20 to <4 x float>* + store <4 x float> %broadcast.splat23, <4 x float>* %21, align 4, !tbaa !20 + %index.next = add i64 %index, 8 + %22 = getelementptr inbounds float, float* %2, i64 %index.next + %23 = bitcast float* %22 to <4 x float>* + store <4 x float> %broadcast.splat23, <4 x float>* %23, align 4, !tbaa !20 + %24 = getelementptr float, float* %22, i64 4 + %25 = bitcast float* %24 to <4 x float>* + store <4 x float> %broadcast.splat23, <4 x float>* %25, align 4, !tbaa !20 + %index.next.1 = add i64 %index, 16 + %26 = getelementptr inbounds float, float* %2, i64 %index.next.1 + %27 = bitcast float* %26 to <4 x float>* + store <4 x float> %broadcast.splat23, <4 x float>* %27, align 4, !tbaa !20 + %28 = getelementptr float, float* %26, i64 4 + %29 = bitcast float* %28 to <4 x float>* + store <4 x float> %broadcast.splat23, <4 x float>* %29, align 4, !tbaa !20 + %index.next.2 = add i64 %index, 24 + %30 = getelementptr inbounds float, float* %2, i64 %index.next.2 + %31 = bitcast float* %30 to <4 x float>* + store <4 x float> %broadcast.splat23, <4 x float>* %31, align 4, !tbaa !20 + %32 = getelementptr float, float* %30, i64 4 + %33 = bitcast float* %32 to <4 x float>* + store <4 x float> %broadcast.splat23, <4 x float>* %33, align 4, !tbaa !20 + %index.next.3 = add i64 %index, 32 + %34 = getelementptr inbounds float, float* %2, i64 %index.next.3 + %35 = bitcast float* %34 to <4 x float>* + store <4 x float> %broadcast.splat23, <4 x float>* %35, align 4, !tbaa !20 + %36 = getelementptr float, float* %34, i64 4 + %37 = bitcast float* %36 to <4 x float>* + store <4 x float> %broadcast.splat23, <4 x float>* %37, align 4, !tbaa !20 + %index.next.4 = add i64 %index, 40 + %38 = getelementptr inbounds float, float* %2, i64 %index.next.4 + %39 = bitcast float* %38 to <4 x float>* + store <4 x float> %broadcast.splat23, <4 x float>* %39, align 4, !tbaa !20 + %40 = getelementptr float, float* %38, i64 4 + %41 = bitcast float* %40 to <4 x float>* + store <4 x float> %broadcast.splat23, <4 x float>* %41, align 4, !tbaa !20 + %index.next.5 = add i64 %index, 48 + %42 = getelementptr inbounds float, float* %2, i64 %index.next.5 + %43 = bitcast float* %42 to <4 x float>* + store <4 x float> %broadcast.splat23, <4 x float>* %43, align 4, !tbaa !20 + %44 = getelementptr float, float* %42, i64 4 + %45 = bitcast float* %44 to <4 x float>* + store <4 x float> %broadcast.splat23, <4 x float>* %45, align 4, !tbaa !20 + %index.next.6 = add i64 %index, 56 + %46 = getelementptr inbounds float, float* %2, i64 %index.next.6 + %47 = bitcast float* %46 to <4 x float>* + store <4 x float> %broadcast.splat23, <4 x float>* %47, align 4, !tbaa !20 + %48 = getelementptr float, float* %46, i64 4 + %49 = bitcast float* %48 to <4 x float>* + store <4 x float> %broadcast.splat23, <4 x float>* %49, align 4, !tbaa !20 + %index.next.7 = add i64 %index, 64 + %50 = icmp eq i64 %index.next.7, %n.vec + br i1 %50, label %middle.block.unr-lcssa, label %vector.body, !llvm.loop !35 + +middle.block.unr-lcssa: ; preds = %vector.body + br label %middle.block + +middle.block: ; preds = %vector.body.prol.loopexit, %middle.block.unr-lcssa + %cmp.n = icmp eq i64 %4, %n.vec + br i1 %cmp.n, label %if.end, label %for.body.preheader24 + +for.body.preheader24: ; preds = %middle.block, %vector.scevcheck, %min.iters.checked, %for.body.preheader + %conv12.ph = phi i64 [ 0, %vector.scevcheck ], [ 0, %min.iters.checked ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] + %i.011.ph = phi i32 [ 0, %vector.scevcheck ], [ 0, %min.iters.checked ], [ 0, %for.body.preheader ], [ %cast.crd, %middle.block ] + br label %for.body + +for.body: ; preds = %for.body.preheader24, %for.body + %conv12 = phi i64 [ %conv, %for.body ], [ %conv12.ph, %for.body.preheader24 ] + %i.011 = phi i32 [ %inc, %for.body ], [ %i.011.ph, %for.body.preheader24 ] + %arrayidx = getelementptr inbounds float, float* %2, i64 %conv12 + store float %target_value, float* %arrayidx, align 4, !tbaa !20 + %inc = add i32 %i.011, 1 + %conv = zext i32 %inc to i64 + %cmp1 = icmp ult i64 %conv, %4 + br i1 %cmp1, label %for.body, label %if.end.loopexit, !llvm.loop !36 + +if.end.loopexit: ; preds = %for.body + br label %if.end + +if.end: ; preds = %if.end.loopexit, %middle.block, %if.then, %entry + ret void +} + +; Function Attrs: nounwind uwtable +define void @_Z21fillTensorWithNegOnesPv(i8* %tensor_ptr) local_unnamed_addr #3 { entry: - tail call void @hpvm_request_tensor(i8* %tensor_ptr, i32 0) #7 + tail call void @hpvm_request_tensor(i8* %tensor_ptr, i32 0) #2 %data_type = bitcast i8* %tensor_ptr to i32* - %0 = load i32, i32* %data_type, align 8, !tbaa !16 + %0 = load i32, i32* %data_type, align 8, !tbaa !19 %cmp = icmp eq i32 %0, 0 br i1 %cmp, label %if.then, label %if.end if.then: ; preds = %entry - %host_data = getelementptr inbounds i8, i8* %tensor_ptr, i64 32 + %host_data = getelementptr inbounds i8, i8* %tensor_ptr, i64 48 %1 = bitcast i8* %host_data to float** - %2 = load float*, float** %1, align 8, !tbaa !15 - %num_elems = getelementptr inbounds i8, i8* %tensor_ptr, i64 48 + %2 = load float*, float** %1, align 8, !tbaa !17 + %num_elems = getelementptr inbounds i8, i8* %tensor_ptr, i64 72 %3 = bitcast i8* %num_elems to i64* - %4 = load i64, i64* %3, align 8, !tbaa !13 + %4 = load i64, i64* %3, align 8, !tbaa !16 %cmp110 = icmp eq i64 %4, 0 br i1 %cmp110, label %if.end, label %for.body.preheader @@ -700,14 +919,14 @@ vector.body.prol: ; preds = %vector.body.prol, % %prol.iter = phi i64 [ %prol.iter.sub, %vector.body.prol ], [ %xtraiter, %vector.body.prol.preheader ] %13 = getelementptr inbounds float, float* %2, i64 %index.prol %14 = bitcast float* %13 to <4 x float>* - store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %14, align 4, !tbaa !17 + store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %14, align 4, !tbaa !20 %15 = getelementptr float, float* %13, i64 4 %16 = bitcast float* %15 to <4 x float>* - store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %16, align 4, !tbaa !17 + store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %16, align 4, !tbaa !20 %index.next.prol = add i64 %index.prol, 8 %prol.iter.sub = add i64 %prol.iter, -1 %prol.iter.cmp = icmp eq i64 %prol.iter.sub, 0 - br i1 %prol.iter.cmp, label %vector.body.prol.loopexit.unr-lcssa, label %vector.body.prol, !llvm.loop !31 + br i1 %prol.iter.cmp, label %vector.body.prol.loopexit.unr-lcssa, label %vector.body.prol, !llvm.loop !37 vector.body.prol.loopexit.unr-lcssa: ; preds = %vector.body.prol br label %vector.body.prol.loopexit @@ -724,62 +943,62 @@ vector.body: ; preds = %vector.body, %vecto %index = phi i64 [ %index.unr, %vector.body.preheader.new ], [ %index.next.7, %vector.body ] %18 = getelementptr inbounds float, float* %2, i64 %index %19 = bitcast float* %18 to <4 x float>* - store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %19, align 4, !tbaa !17 + store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %19, align 4, !tbaa !20 %20 = getelementptr float, float* %18, i64 4 %21 = bitcast float* %20 to <4 x float>* - store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %21, align 4, !tbaa !17 + store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %21, align 4, !tbaa !20 %index.next = add i64 %index, 8 %22 = getelementptr inbounds float, float* %2, i64 %index.next %23 = bitcast float* %22 to <4 x float>* - store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %23, align 4, !tbaa !17 + store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %23, align 4, !tbaa !20 %24 = getelementptr float, float* %22, i64 4 %25 = bitcast float* %24 to <4 x float>* - store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %25, align 4, !tbaa !17 + store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %25, align 4, !tbaa !20 %index.next.1 = add i64 %index, 16 %26 = getelementptr inbounds float, float* %2, i64 %index.next.1 %27 = bitcast float* %26 to <4 x float>* - store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %27, align 4, !tbaa !17 + store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %27, align 4, !tbaa !20 %28 = getelementptr float, float* %26, i64 4 %29 = bitcast float* %28 to <4 x float>* - store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %29, align 4, !tbaa !17 + store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %29, align 4, !tbaa !20 %index.next.2 = add i64 %index, 24 %30 = getelementptr inbounds float, float* %2, i64 %index.next.2 %31 = bitcast float* %30 to <4 x float>* - store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %31, align 4, !tbaa !17 + store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %31, align 4, !tbaa !20 %32 = getelementptr float, float* %30, i64 4 %33 = bitcast float* %32 to <4 x float>* - store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %33, align 4, !tbaa !17 + store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %33, align 4, !tbaa !20 %index.next.3 = add i64 %index, 32 %34 = getelementptr inbounds float, float* %2, i64 %index.next.3 %35 = bitcast float* %34 to <4 x float>* - store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %35, align 4, !tbaa !17 + store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %35, align 4, !tbaa !20 %36 = getelementptr float, float* %34, i64 4 %37 = bitcast float* %36 to <4 x float>* - store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %37, align 4, !tbaa !17 + store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %37, align 4, !tbaa !20 %index.next.4 = add i64 %index, 40 %38 = getelementptr inbounds float, float* %2, i64 %index.next.4 %39 = bitcast float* %38 to <4 x float>* - store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %39, align 4, !tbaa !17 + store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %39, align 4, !tbaa !20 %40 = getelementptr float, float* %38, i64 4 %41 = bitcast float* %40 to <4 x float>* - store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %41, align 4, !tbaa !17 + store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %41, align 4, !tbaa !20 %index.next.5 = add i64 %index, 48 %42 = getelementptr inbounds float, float* %2, i64 %index.next.5 %43 = bitcast float* %42 to <4 x float>* - store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %43, align 4, !tbaa !17 + store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %43, align 4, !tbaa !20 %44 = getelementptr float, float* %42, i64 4 %45 = bitcast float* %44 to <4 x float>* - store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %45, align 4, !tbaa !17 + store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %45, align 4, !tbaa !20 %index.next.6 = add i64 %index, 56 %46 = getelementptr inbounds float, float* %2, i64 %index.next.6 %47 = bitcast float* %46 to <4 x float>* - store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %47, align 4, !tbaa !17 + store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %47, align 4, !tbaa !20 %48 = getelementptr float, float* %46, i64 4 %49 = bitcast float* %48 to <4 x float>* - store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %49, align 4, !tbaa !17 + store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %49, align 4, !tbaa !20 %index.next.7 = add i64 %index, 64 %50 = icmp eq i64 %index.next.7, %n.vec - br i1 %50, label %middle.block.unr-lcssa, label %vector.body, !llvm.loop !32 + br i1 %50, label %middle.block.unr-lcssa, label %vector.body, !llvm.loop !38 middle.block.unr-lcssa: ; preds = %vector.body br label %middle.block @@ -797,11 +1016,11 @@ for.body: ; preds = %for.body.preheader2 %conv12 = phi i64 [ %conv, %for.body ], [ %conv12.ph, %for.body.preheader22 ] %i.011 = phi i32 [ %inc, %for.body ], [ %i.011.ph, %for.body.preheader22 ] %arrayidx = getelementptr inbounds float, float* %2, i64 %conv12 - store float -1.000000e+00, float* %arrayidx, align 4, !tbaa !17 + store float -1.000000e+00, float* %arrayidx, align 4, !tbaa !20 %inc = add i32 %i.011, 1 %conv = zext i32 %inc to i64 %cmp1 = icmp ult i64 %conv, %4 - br i1 %cmp1, label %for.body, label %if.end.loopexit, !llvm.loop !33 + br i1 %cmp1, label %for.body, label %if.end.loopexit, !llvm.loop !39 if.end.loopexit: ; preds = %for.body br label %if.end @@ -811,20 +1030,20 @@ if.end: ; preds = %if.end.loopexit, %m } ; Function Attrs: norecurse nounwind uwtable -define void @_Z14fillTensorValsPv(i8* nocapture readonly %tensor_ptr) local_unnamed_addr #5 { +define void @_Z14fillTensorValsPv(i8* nocapture readonly %tensor_ptr) local_unnamed_addr #6 { entry: %data_type = bitcast i8* %tensor_ptr to i32* - %0 = load i32, i32* %data_type, align 8, !tbaa !16 + %0 = load i32, i32* %data_type, align 8, !tbaa !19 %cmp = icmp eq i32 %0, 0 br i1 %cmp, label %if.then, label %if.end if.then: ; preds = %entry - %host_data = getelementptr inbounds i8, i8* %tensor_ptr, i64 32 + %host_data = getelementptr inbounds i8, i8* %tensor_ptr, i64 48 %1 = bitcast i8* %host_data to float** - %2 = load float*, float** %1, align 8, !tbaa !15 - %num_elems = getelementptr inbounds i8, i8* %tensor_ptr, i64 48 + %2 = load float*, float** %1, align 8, !tbaa !17 + %num_elems = getelementptr inbounds i8, i8* %tensor_ptr, i64 72 %3 = bitcast i8* %num_elems to i64* - %4 = load i64, i64* %3, align 8, !tbaa !13 + %4 = load i64, i64* %3, align 8, !tbaa !16 %cmp111 = icmp eq i64 %4, 0 br i1 %cmp111, label %if.end, label %for.body.preheader @@ -858,10 +1077,10 @@ vector.body.prol.preheader: ; preds = %vector.body.prehead vector.body.prol: ; preds = %vector.body.prol.preheader %13 = bitcast float* %2 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>, <4 x float>* %13, align 4, !tbaa !17 + store <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>, <4 x float>* %13, align 4, !tbaa !20 %14 = getelementptr float, float* %2, i64 4 %15 = bitcast float* %14 to <4 x float>* - store <4 x float> <float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00>, <4 x float>* %15, align 4, !tbaa !17 + store <4 x float> <float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00>, <4 x float>* %15, align 4, !tbaa !20 br label %vector.body.prol.loopexit vector.body.prol.loopexit: ; preds = %vector.body.prol, %vector.body.preheader @@ -883,10 +1102,10 @@ vector.body: ; preds = %vector.body, %vecto %21 = uitofp <4 x i32> %19 to <4 x float> %22 = getelementptr inbounds float, float* %2, i64 %index %23 = bitcast float* %22 to <4 x float>* - store <4 x float> %20, <4 x float>* %23, align 4, !tbaa !17 + store <4 x float> %20, <4 x float>* %23, align 4, !tbaa !20 %24 = getelementptr float, float* %22, i64 4 %25 = bitcast float* %24 to <4 x float>* - store <4 x float> %21, <4 x float>* %25, align 4, !tbaa !17 + store <4 x float> %21, <4 x float>* %25, align 4, !tbaa !20 %index.next = add i64 %index, 8 %26 = trunc i64 %index.next to i32 %broadcast.splatinsert19.1 = insertelement <4 x i32> undef, i32 %26, i32 0 @@ -897,13 +1116,13 @@ vector.body: ; preds = %vector.body, %vecto %30 = uitofp <4 x i32> %28 to <4 x float> %31 = getelementptr inbounds float, float* %2, i64 %index.next %32 = bitcast float* %31 to <4 x float>* - store <4 x float> %29, <4 x float>* %32, align 4, !tbaa !17 + store <4 x float> %29, <4 x float>* %32, align 4, !tbaa !20 %33 = getelementptr float, float* %31, i64 4 %34 = bitcast float* %33 to <4 x float>* - store <4 x float> %30, <4 x float>* %34, align 4, !tbaa !17 + store <4 x float> %30, <4 x float>* %34, align 4, !tbaa !20 %index.next.1 = add i64 %index, 16 %35 = icmp eq i64 %index.next.1, %n.vec - br i1 %35, label %middle.block.unr-lcssa, label %vector.body, !llvm.loop !34 + br i1 %35, label %middle.block.unr-lcssa, label %vector.body, !llvm.loop !40 middle.block.unr-lcssa: ; preds = %vector.body br label %middle.block @@ -923,10 +1142,10 @@ for.body: ; preds = %for.body.preheader2 %add = add i32 %i.012, 1 %conv2 = uitofp i32 %add to float %arrayidx = getelementptr inbounds float, float* %2, i64 %conv13 - store float %conv2, float* %arrayidx, align 4, !tbaa !17 + store float %conv2, float* %arrayidx, align 4, !tbaa !20 %conv = zext i32 %add to i64 %cmp1 = icmp ult i64 %conv, %4 - br i1 %cmp1, label %for.body, label %if.end.loopexit, !llvm.loop !35 + br i1 %cmp1, label %for.body, label %if.end.loopexit, !llvm.loop !41 if.end.loopexit: ; preds = %for.body br label %if.end @@ -936,21 +1155,21 @@ if.end: ; preds = %if.end.loopexit, %m } ; Function Attrs: nounwind uwtable -define void @_Z17printTensorValuesPv(i8* %tensor_ptr) local_unnamed_addr #0 { +define void @_Z17printTensorValuesPv(i8* %tensor_ptr) local_unnamed_addr #3 { entry: - tail call void @hpvm_request_tensor(i8* %tensor_ptr, i32 0) #7 + tail call void @hpvm_request_tensor(i8* %tensor_ptr, i32 0) #2 %data_type = bitcast i8* %tensor_ptr to i32* - %0 = load i32, i32* %data_type, align 8, !tbaa !16 + %0 = load i32, i32* %data_type, align 8, !tbaa !19 %cmp = icmp eq i32 %0, 0 br i1 %cmp, label %if.then, label %if.end if.then: ; preds = %entry - %host_data = getelementptr inbounds i8, i8* %tensor_ptr, i64 32 + %host_data = getelementptr inbounds i8, i8* %tensor_ptr, i64 48 %1 = bitcast i8* %host_data to float** - %2 = load float*, float** %1, align 8, !tbaa !15 - %num_elems = getelementptr inbounds i8, i8* %tensor_ptr, i64 48 + %2 = load float*, float** %1, align 8, !tbaa !17 + %num_elems = getelementptr inbounds i8, i8* %tensor_ptr, i64 72 %3 = bitcast i8* %num_elems to i64* - %4 = load i64, i64* %3, align 8, !tbaa !13 + %4 = load i64, i64* %3, align 8, !tbaa !16 %cmp112 = icmp eq i64 %4, 0 br i1 %cmp112, label %if.end, label %for.body.preheader @@ -961,12 +1180,12 @@ for.body: ; preds = %for.body.preheader, %conv14 = phi i64 [ %conv, %for.body ], [ 0, %for.body.preheader ] %i.013 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] %arrayidx = getelementptr inbounds float, float* %2, i64 %conv14 - %5 = load float, float* %arrayidx, align 4, !tbaa !17 + %5 = load float, float* %arrayidx, align 4, !tbaa !20 %conv2 = fpext float %5 to double - %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str.9, i64 0, i64 0), double %conv2) + %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str.8, i64 0, i64 0), double %conv2) %inc = add i32 %i.013, 1 %conv = zext i32 %inc to i64 - %6 = load i64, i64* %3, align 8, !tbaa !13 + %6 = load i64, i64* %3, align 8, !tbaa !16 %cmp1 = icmp ult i64 %conv, %6 br i1 %cmp1, label %for.body, label %if.end.loopexit @@ -974,25 +1193,25 @@ if.end.loopexit: ; preds = %for.body br label %if.end if.end: ; preds = %if.end.loopexit, %if.then, %entry - %putchar = tail call i32 @putchar(i32 10) #7 + %putchar = tail call i32 @putchar(i32 10) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z15printTensorDimsPv(i8* nocapture readonly %tensor_ptr) local_unnamed_addr #0 { +define void @_Z15printTensorDimsPv(i8* nocapture readonly %tensor_ptr) local_unnamed_addr #3 { entry: - %num_elems = getelementptr inbounds i8, i8* %tensor_ptr, i64 48 + %num_elems = getelementptr inbounds i8, i8* %tensor_ptr, i64 72 %0 = bitcast i8* %num_elems to i64* - %1 = load i64, i64* %0, align 8, !tbaa !13 - %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.11, i64 0, i64 0), i64 %1) - %dims = getelementptr inbounds i8, i8* %tensor_ptr, i64 64 + %1 = load i64, i64* %0, align 8, !tbaa !16 + %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.10, i64 0, i64 0), i64 %1) + %dims = getelementptr inbounds i8, i8* %tensor_ptr, i64 88 %num_dims = bitcast i8* %dims to i32* - %2 = load i32, i32* %num_dims, align 8, !tbaa !10 + %2 = load i32, i32* %num_dims, align 8, !tbaa !13 %cmp10 = icmp sgt i32 %2, 0 br i1 %cmp10, label %for.body.lr.ph, label %for.cond.cleanup for.body.lr.ph: ; preds = %entry - %dim_sizes = getelementptr inbounds i8, i8* %tensor_ptr, i64 72 + %dim_sizes = getelementptr inbounds i8, i8* %tensor_ptr, i64 96 %3 = bitcast i8* %dim_sizes to i64** br label %for.body @@ -1004,32 +1223,32 @@ for.cond.cleanup: ; preds = %for.cond.cleanup.lo for.body: ; preds = %for.body.lr.ph, %for.body %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] - %4 = load i64*, i64** %3, align 8, !tbaa !11 + %4 = load i64*, i64** %3, align 8, !tbaa !14 %arrayidx = getelementptr inbounds i64, i64* %4, i64 %indvars.iv - %5 = load i64, i64* %arrayidx, align 8, !tbaa !12 + %5 = load i64, i64* %arrayidx, align 8, !tbaa !15 %6 = trunc i64 %indvars.iv to i32 - %call2 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.12, i64 0, i64 0), i32 %6, i64 %5) + %call2 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.11, i64 0, i64 0), i32 %6, i64 %5) %indvars.iv.next = add nuw i64 %indvars.iv, 1 - %7 = load i32, i32* %num_dims, align 8, !tbaa !10 + %7 = load i32, i32* %num_dims, align 8, !tbaa !13 %8 = sext i32 %7 to i64 %cmp = icmp slt i64 %indvars.iv.next, %8 br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit } ; Function Attrs: nounwind uwtable -define void @_Z14compareTensorsPvS_(i8* %tensor1_ptr, i8* %tensor2_ptr) local_unnamed_addr #0 { +define void @_Z14compareTensorsPvS_(i8* %tensor1_ptr, i8* %tensor2_ptr) local_unnamed_addr #3 { entry: - tail call void @hpvm_request_tensor(i8* %tensor1_ptr, i32 0) #7 - tail call void @hpvm_request_tensor(i8* %tensor2_ptr, i32 0) #7 - %host_data = getelementptr inbounds i8, i8* %tensor1_ptr, i64 32 + tail call void @hpvm_request_tensor(i8* %tensor1_ptr, i32 0) #2 + tail call void @hpvm_request_tensor(i8* %tensor2_ptr, i32 0) #2 + %host_data = getelementptr inbounds i8, i8* %tensor1_ptr, i64 48 %0 = bitcast i8* %host_data to float** - %1 = load float*, float** %0, align 8, !tbaa !15 - %host_data1 = getelementptr inbounds i8, i8* %tensor2_ptr, i64 32 + %1 = load float*, float** %0, align 8, !tbaa !17 + %host_data1 = getelementptr inbounds i8, i8* %tensor2_ptr, i64 48 %2 = bitcast i8* %host_data1 to float** - %3 = load float*, float** %2, align 8, !tbaa !15 - %num_elems = getelementptr inbounds i8, i8* %tensor1_ptr, i64 48 + %3 = load float*, float** %2, align 8, !tbaa !17 + %num_elems = getelementptr inbounds i8, i8* %tensor1_ptr, i64 72 %4 = bitcast i8* %num_elems to i64* - %5 = load i64, i64* %4, align 8, !tbaa !13 + %5 = load i64, i64* %4, align 8, !tbaa !16 %cmp17 = icmp eq i64 %5, 0 br i1 %cmp17, label %for.cond.cleanup, label %for.body.preheader @@ -1046,15 +1265,15 @@ for.body: ; preds = %for.body.preheader, %conv19 = phi i64 [ %conv, %for.inc ], [ 0, %for.body.preheader ] %i.018 = phi i32 [ %inc, %for.inc ], [ 0, %for.body.preheader ] %arrayidx = getelementptr inbounds float, float* %1, i64 %conv19 - %6 = load float, float* %arrayidx, align 4, !tbaa !17 + %6 = load float, float* %arrayidx, align 4, !tbaa !20 %arrayidx3 = getelementptr inbounds float, float* %3, i64 %conv19 - %7 = load float, float* %arrayidx3, align 4, !tbaa !17 + %7 = load float, float* %arrayidx3, align 4, !tbaa !20 %cmp4 = fcmp fast une float %6, %7 br i1 %cmp4, label %if.then, label %for.inc if.then: ; preds = %for.body - %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([35 x i8], [35 x i8]* @.str.13, i64 0, i64 0), i32 %i.018) - tail call void @abort() #8 + %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([35 x i8], [35 x i8]* @.str.12, i64 0, i64 0), i32 %i.018) + tail call void @abort() #13 unreachable for.inc: ; preds = %for.body @@ -1065,12 +1284,12 @@ for.inc: ; preds = %for.body } ; Function Attrs: nounwind uwtable -define void @_Z13compareValuesPvPfm(i8* %tensor_ptr, float* nocapture readonly %data, i64 %num_elems) local_unnamed_addr #0 { +define void @_Z13compareValuesPvPfm(i8* %tensor_ptr, float* nocapture readonly %data, i64 %num_elems) local_unnamed_addr #3 { entry: - tail call void @hpvm_request_tensor(i8* %tensor_ptr, i32 0) #7 - %host_data = getelementptr inbounds i8, i8* %tensor_ptr, i64 32 + tail call void @hpvm_request_tensor(i8* %tensor_ptr, i32 0) #2 + %host_data = getelementptr inbounds i8, i8* %tensor_ptr, i64 48 %0 = bitcast i8* %host_data to float** - %1 = load float*, float** %0, align 8, !tbaa !15 + %1 = load float*, float** %0, align 8, !tbaa !17 %cmp11 = icmp eq i64 %num_elems, 0 br i1 %cmp11, label %for.cond.cleanup, label %for.body.preheader @@ -1092,21 +1311,21 @@ for.body: ; preds = %for.body.preheader, %conv13 = phi i64 [ %conv, %for.cond ], [ 0, %for.body.preheader ] %i.012 = phi i32 [ %inc, %for.cond ], [ 0, %for.body.preheader ] %arrayidx = getelementptr inbounds float, float* %1, i64 %conv13 - %2 = load float, float* %arrayidx, align 4, !tbaa !17 + %2 = load float, float* %arrayidx, align 4, !tbaa !20 %arrayidx2 = getelementptr inbounds float, float* %data, i64 %conv13 - %3 = load float, float* %arrayidx2, align 4, !tbaa !17 + %3 = load float, float* %arrayidx2, align 4, !tbaa !20 %cmp3 = fcmp fast une float %2, %3 %inc = add i32 %i.012, 1 br i1 %cmp3, label %if.then, label %for.cond if.then: ; preds = %for.body - %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([21 x i8], [21 x i8]* @.str.14, i64 0, i64 0)) - tail call void @abort() #8 + %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([21 x i8], [21 x i8]* @.str.13, i64 0, i64 0)) + tail call void @abort() #13 unreachable } ; Function Attrs: nounwind uwtable -define i8* @_Z15readInputTensorPKciiiii(i8* %file_name, i32 %data_type, i32 %dim1_size, i32 %dim2_size, i32 %dim3_size, i32 %dim4_size) local_unnamed_addr #0 { +define i8* @_Z15readInputTensorPKciiiii(i8* %file_name, i32 %data_type, i32 %dim1_size, i32 %dim2_size, i32 %dim3_size, i32 %dim4_size) local_unnamed_addr #3 { entry: %mul = mul nsw i32 %dim2_size, %dim1_size %mul1 = mul nsw i32 %mul, %dim3_size @@ -1116,39 +1335,40 @@ entry: %mul5 = mul nsw i32 %mul4, %dim3_size %mul6 = mul nsw i32 %mul5, %dim4_size %conv = sext i32 %mul2 to i64 - %call = tail call noalias i8* @malloc(i64 %conv) #7 + %call = tail call noalias i8* @malloc(i64 %conv) #2 %mul9 = shl nsw i64 %conv, 2 - %call10 = tail call noalias i8* @malloc(i64 %mul9) #7 + %call10 = tail call noalias i8* @malloc(i64 %mul9) #2 %0 = bitcast i8* %call10 to float* - %call11 = tail call %struct._IO_FILE* @fopen(i8* %file_name, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.15, i64 0, i64 0)) + %call11 = tail call %struct._IO_FILE* @fopen(i8* %file_name, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.14, i64 0, i64 0)) %cmp = icmp eq %struct._IO_FILE* %call11, null br i1 %cmp, label %if.then, label %if.end if.then: ; preds = %entry - %call12 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([41 x i8], [41 x i8]* @.str.16, i64 0, i64 0), i8* %file_name) - tail call void @abort() #8 + %call12 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([41 x i8], [41 x i8]* @.str.15, i64 0, i64 0), i8* %file_name) + tail call void @abort() #13 unreachable if.end: ; preds = %entry %call14 = tail call i32 @fseek(%struct._IO_FILE* nonnull %call11, i64 16, i32 1) %call17 = tail call i64 @fread(i8* %call, i64 1, i64 %conv, %struct._IO_FILE* nonnull %call11) - %cmp1962 = icmp eq i32 %mul2, 0 - br i1 %cmp1962, label %for.cond.cleanup, label %for.body.preheader + %call18 = tail call i32 @fclose(%struct._IO_FILE* nonnull %call11) + %cmp2060 = icmp eq i32 %mul2, 0 + br i1 %cmp2060, label %for.cond.cleanup, label %for.body.preheader for.body.preheader: ; preds = %if.end %1 = icmp ugt i64 %conv, 1 %umax = select i1 %1, i64 %conv, i64 1 %min.iters.check = icmp ult i64 %umax, 8 - br i1 %min.iters.check, label %for.body.preheader68, label %min.iters.checked + br i1 %min.iters.check, label %for.body.preheader64, label %min.iters.checked -for.body.preheader68: ; preds = %middle.block, %min.iters.checked, %for.body.preheader - %i.063.ph = phi i64 [ 0, %min.iters.checked ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] +for.body.preheader64: ; preds = %middle.block, %min.iters.checked, %for.body.preheader + %i.061.ph = phi i64 [ 0, %min.iters.checked ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] br label %for.body min.iters.checked: ; preds = %for.body.preheader %n.vec = and i64 %umax, -8 %cmp.zero = icmp eq i64 %n.vec, 0 - br i1 %cmp.zero, label %for.body.preheader68, label %vector.body.preheader + br i1 %cmp.zero, label %for.body.preheader64, label %vector.body.preheader vector.body.preheader: ; preds = %min.iters.checked %2 = add nsw i64 %n.vec, -8 @@ -1162,19 +1382,19 @@ vector.body.prol.preheader: ; preds = %vector.body.prehead vector.body.prol: ; preds = %vector.body.prol.preheader %5 = bitcast i8* %call to <4 x i8>* - %wide.load.prol = load <4 x i8>, <4 x i8>* %5, align 1, !tbaa !36 + %wide.load.prol = load <4 x i8>, <4 x i8>* %5, align 1, !tbaa !42 %6 = getelementptr i8, i8* %call, i64 4 %7 = bitcast i8* %6 to <4 x i8>* - %wide.load67.prol = load <4 x i8>, <4 x i8>* %7, align 1, !tbaa !36 + %wide.load63.prol = load <4 x i8>, <4 x i8>* %7, align 1, !tbaa !42 %8 = uitofp <4 x i8> %wide.load.prol to <4 x float> - %9 = uitofp <4 x i8> %wide.load67.prol to <4 x float> + %9 = uitofp <4 x i8> %wide.load63.prol to <4 x float> %10 = fmul fast <4 x float> %8, <float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000> %11 = fmul fast <4 x float> %9, <float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000> %12 = bitcast i8* %call10 to <4 x float>* - store <4 x float> %10, <4 x float>* %12, align 4, !tbaa !17 + store <4 x float> %10, <4 x float>* %12, align 4, !tbaa !20 %13 = getelementptr i8, i8* %call10, i64 16 %14 = bitcast i8* %13 to <4 x float>* - store <4 x float> %11, <4 x float>* %14, align 4, !tbaa !17 + store <4 x float> %11, <4 x float>* %14, align 4, !tbaa !20 br label %vector.body.prol.loopexit vector.body.prol.loopexit: ; preds = %vector.body.prol, %vector.body.preheader @@ -1189,216 +1409,385 @@ vector.body: ; preds = %vector.body, %vecto %index = phi i64 [ %index.unr, %vector.body.preheader.new ], [ %index.next.1, %vector.body ] %16 = getelementptr inbounds i8, i8* %call, i64 %index %17 = bitcast i8* %16 to <4 x i8>* - %wide.load = load <4 x i8>, <4 x i8>* %17, align 1, !tbaa !36 + %wide.load = load <4 x i8>, <4 x i8>* %17, align 1, !tbaa !42 %18 = getelementptr i8, i8* %16, i64 4 %19 = bitcast i8* %18 to <4 x i8>* - %wide.load67 = load <4 x i8>, <4 x i8>* %19, align 1, !tbaa !36 + %wide.load63 = load <4 x i8>, <4 x i8>* %19, align 1, !tbaa !42 %20 = uitofp <4 x i8> %wide.load to <4 x float> - %21 = uitofp <4 x i8> %wide.load67 to <4 x float> + %21 = uitofp <4 x i8> %wide.load63 to <4 x float> %22 = fmul fast <4 x float> %20, <float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000> %23 = fmul fast <4 x float> %21, <float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000> %24 = getelementptr inbounds float, float* %0, i64 %index %25 = bitcast float* %24 to <4 x float>* - store <4 x float> %22, <4 x float>* %25, align 4, !tbaa !17 + store <4 x float> %22, <4 x float>* %25, align 4, !tbaa !20 %26 = getelementptr float, float* %24, i64 4 %27 = bitcast float* %26 to <4 x float>* - store <4 x float> %23, <4 x float>* %27, align 4, !tbaa !17 + store <4 x float> %23, <4 x float>* %27, align 4, !tbaa !20 %index.next = add i64 %index, 8 %28 = getelementptr inbounds i8, i8* %call, i64 %index.next %29 = bitcast i8* %28 to <4 x i8>* - %wide.load.1 = load <4 x i8>, <4 x i8>* %29, align 1, !tbaa !36 + %wide.load.1 = load <4 x i8>, <4 x i8>* %29, align 1, !tbaa !42 %30 = getelementptr i8, i8* %28, i64 4 %31 = bitcast i8* %30 to <4 x i8>* - %wide.load67.1 = load <4 x i8>, <4 x i8>* %31, align 1, !tbaa !36 + %wide.load63.1 = load <4 x i8>, <4 x i8>* %31, align 1, !tbaa !42 %32 = uitofp <4 x i8> %wide.load.1 to <4 x float> - %33 = uitofp <4 x i8> %wide.load67.1 to <4 x float> + %33 = uitofp <4 x i8> %wide.load63.1 to <4 x float> %34 = fmul fast <4 x float> %32, <float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000> %35 = fmul fast <4 x float> %33, <float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000> %36 = getelementptr inbounds float, float* %0, i64 %index.next %37 = bitcast float* %36 to <4 x float>* - store <4 x float> %34, <4 x float>* %37, align 4, !tbaa !17 + store <4 x float> %34, <4 x float>* %37, align 4, !tbaa !20 %38 = getelementptr float, float* %36, i64 4 %39 = bitcast float* %38 to <4 x float>* - store <4 x float> %35, <4 x float>* %39, align 4, !tbaa !17 + store <4 x float> %35, <4 x float>* %39, align 4, !tbaa !20 %index.next.1 = add i64 %index, 16 %40 = icmp eq i64 %index.next.1, %n.vec - br i1 %40, label %middle.block.unr-lcssa, label %vector.body, !llvm.loop !37 + br i1 %40, label %middle.block.unr-lcssa, label %vector.body, !llvm.loop !43 middle.block.unr-lcssa: ; preds = %vector.body br label %middle.block middle.block: ; preds = %vector.body.prol.loopexit, %middle.block.unr-lcssa %cmp.n = icmp eq i64 %umax, %n.vec - br i1 %cmp.n, label %for.cond.cleanup.loopexit, label %for.body.preheader68 + br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader64 -for.cond.cleanup.loopexit.loopexit: ; preds = %for.body - br label %for.cond.cleanup.loopexit - -for.cond.cleanup.loopexit: ; preds = %for.cond.cleanup.loopexit.loopexit, %middle.block - %arrayidx22.phi.trans.insert = getelementptr inbounds i8, i8* %call10, i64 40 - %.phi.trans.insert = bitcast i8* %arrayidx22.phi.trans.insert to float* - %.pre = load float, float* %.phi.trans.insert, align 4, !tbaa !17 - %phitmp = fpext float %.pre to double +for.cond.cleanup.loopexit: ; preds = %for.body br label %for.cond.cleanup -for.cond.cleanup: ; preds = %if.end, %for.cond.cleanup.loopexit - %41 = phi double [ %phitmp, %for.cond.cleanup.loopexit ], [ undef, %if.end ] - %call24 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str.17, i64 0, i64 0), i32 10, double %41) - %conv25 = sext i32 %dim1_size to i64 - %conv26 = sext i32 %dim2_size to i64 - %conv27 = sext i32 %dim3_size to i64 - %conv28 = sext i32 %dim4_size to i64 - %call29 = tail call i8* @create4DTensor(i32 %data_type, i32 0, i64 %conv25, i64 %conv26, i64 %conv27, i64 %conv28) #7 - %conv30 = sext i32 %mul6 to i64 - tail call void @initTensorData(i8* %call29, i8* %call10, i64 %conv30) #7 - ret i8* %call29 - -for.body: ; preds = %for.body.preheader68, %for.body - %i.063 = phi i64 [ %inc, %for.body ], [ %i.063.ph, %for.body.preheader68 ] - %arrayidx = getelementptr inbounds i8, i8* %call, i64 %i.063 - %42 = load i8, i8* %arrayidx, align 1, !tbaa !36 - %conv20 = uitofp i8 %42 to float - %div = fmul fast float %conv20, 0x3F70101020000000 - %arrayidx21 = getelementptr inbounds float, float* %0, i64 %i.063 - store float %div, float* %arrayidx21, align 4, !tbaa !17 - %inc = add nuw i64 %i.063, 1 - %cmp19 = icmp ult i64 %inc, %conv - br i1 %cmp19, label %for.body, label %for.cond.cleanup.loopexit.loopexit, !llvm.loop !38 +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %middle.block, %if.end + %conv23 = sext i32 %dim1_size to i64 + %conv24 = sext i32 %dim2_size to i64 + %conv25 = sext i32 %dim3_size to i64 + %conv26 = sext i32 %dim4_size to i64 + %call27 = tail call i8* @create4DTensor(i32 %data_type, i32 0, i64 %conv23, i64 %conv24, i64 %conv25, i64 %conv26) #2 + %conv28 = sext i32 %mul6 to i64 + tail call void @initTensorData(i8* %call27, i8* %call10, i64 %conv28) #2 + ret i8* %call27 + +for.body: ; preds = %for.body.preheader64, %for.body + %i.061 = phi i64 [ %inc, %for.body ], [ %i.061.ph, %for.body.preheader64 ] + %arrayidx = getelementptr inbounds i8, i8* %call, i64 %i.061 + %41 = load i8, i8* %arrayidx, align 1, !tbaa !42 + %conv21 = uitofp i8 %41 to float + %div = fmul fast float %conv21, 0x3F70101020000000 + %arrayidx22 = getelementptr inbounds float, float* %0, i64 %i.061 + store float %div, float* %arrayidx22, align 4, !tbaa !20 + %inc = add nuw i64 %i.061, 1 + %cmp20 = icmp ult i64 %inc, %conv + br i1 %cmp20, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !44 } ; Function Attrs: nounwind -declare noalias i8* @malloc(i64) local_unnamed_addr #2 +declare noalias i8* @malloc(i64) local_unnamed_addr #1 ; Function Attrs: nounwind -declare i32 @fseek(%struct._IO_FILE* nocapture, i64, i32) local_unnamed_addr #2 +declare i32 @fseek(%struct._IO_FILE* nocapture, i64, i32) local_unnamed_addr #1 ; Function Attrs: nounwind -declare i64 @fread(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) local_unnamed_addr #2 +declare i64 @fread(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) local_unnamed_addr #1 -declare i8* @create4DTensor(i32, i32, i64, i64, i64, i64) local_unnamed_addr #3 +declare i8* @create4DTensor(i32, i32, i64, i64, i64, i64) local_unnamed_addr #0 -declare void @initTensorData(i8*, i8*, i64) local_unnamed_addr #3 +declare void @initTensorData(i8*, i8*, i64) local_unnamed_addr #0 ; Function Attrs: nounwind uwtable -define %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %file_name, i32 %data_type, i32 %dim1_size, i32 %dim2_size, i32 %dim3_size, i32 %dim4_size) local_unnamed_addr #0 { +define %struct.Tensor* @_Z21readTrainedWeightsCPUPKciiiii(i8* %file_name, i32 %data_type, i32 %dim1_size, i32 %dim2_size, i32 %dim3_size, i32 %dim4_size) local_unnamed_addr #3 { entry: %mul = mul nsw i32 %dim2_size, %dim1_size %mul1 = mul nsw i32 %mul, %dim3_size %mul2 = mul nsw i32 %mul1, %dim4_size %conv = sext i32 %mul2 to i64 - %mul7 = shl nsw i64 %conv, 2 - %call = tail call noalias i8* @malloc(i64 %mul7) #7 - %call8 = tail call %struct._IO_FILE* @fopen(i8* %file_name, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.15, i64 0, i64 0)) - %cmp = icmp eq %struct._IO_FILE* %call8, null + %mul3 = shl i32 %dim1_size, 2 + %mul4 = mul nsw i32 %mul3, %dim2_size + %mul5 = mul nsw i32 %mul4, %dim3_size + %mul6 = mul nsw i32 %mul5, %dim4_size + %conv7 = sext i32 %mul6 to i64 + %mul8 = shl nsw i64 %conv, 2 + %call = tail call noalias i8* @malloc(i64 %mul8) #2 + %call9 = tail call %struct._IO_FILE* @fopen(i8* %file_name, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.14, i64 0, i64 0)) + %cmp = icmp eq %struct._IO_FILE* %call9, null br i1 %cmp, label %if.then, label %if.end if.then: ; preds = %entry - %call9 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([41 x i8], [41 x i8]* @.str.16, i64 0, i64 0), i8* %file_name) - tail call void @abort() #8 + %call10 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([41 x i8], [41 x i8]* @.str.15, i64 0, i64 0), i8* %file_name) + tail call void @abort() #13 unreachable if.end: ; preds = %entry - %0 = bitcast i8* %call to float* - %mul3 = shl i32 %dim1_size, 2 + %call12 = tail call i32 @fseek(%struct._IO_FILE* nonnull %call9, i64 0, i32 1) + %call13 = tail call i64 @fread(i8* %call, i64 1, i64 %conv7, %struct._IO_FILE* nonnull %call9) + %call14 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([40 x i8], [40 x i8]* @.str.16, i64 0, i64 0), i64 %conv7, i64 %call13) + %call15 = tail call i32 @fclose(%struct._IO_FILE* nonnull %call9) + %conv16 = sext i32 %dim1_size to i64 + %conv17 = sext i32 %dim2_size to i64 + %conv18 = sext i32 %dim3_size to i64 + %conv19 = sext i32 %dim4_size to i64 + %call20 = tail call i8* @create4DTensor(i32 %data_type, i32 0, i64 %conv16, i64 %conv17, i64 %conv18, i64 %conv19) #2 + %0 = bitcast i8* %call20 to %struct.Tensor* + tail call void @initTensorData(i8* %call20, i8* %call, i64 %conv7) #2 + tail call void @free(i8* %call) #2 + ret %struct.Tensor* %0 +} + +; Function Attrs: nounwind +declare void @free(i8* nocapture) local_unnamed_addr #1 + +; Function Attrs: nounwind uwtable +define %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %file_name, i32 %data_type, i64 %dim1_size, i64 %dim2_size, i64 %dim3_size, i64 %dim4_size) local_unnamed_addr #3 { +entry: + %mul3 = shl i64 %dim1_size, 2 + %mul4 = mul i64 %mul3, %dim2_size + %mul5 = mul i64 %mul4, %dim3_size + %mul6 = mul i64 %mul5, %dim4_size + %call = tail call noalias i8* @malloc(i64 %mul6) #2 + %call8 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str.17, i64 0, i64 0), i64 %mul6) + %call9 = tail call %struct._IO_FILE* @fopen(i8* %file_name, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.14, i64 0, i64 0)) + %cmp = icmp eq %struct._IO_FILE* %call9, null + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %call10 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([41 x i8], [41 x i8]* @.str.15, i64 0, i64 0), i8* %file_name) + tail call void @abort() #13 + unreachable + +if.end: ; preds = %entry + %call12 = tail call i32 @fseek(%struct._IO_FILE* nonnull %call9, i64 0, i32 1) + %call13 = tail call i64 @fread(i8* %call, i64 1, i64 %mul6, %struct._IO_FILE* nonnull %call9) + %call14 = tail call i32 @fclose(%struct._IO_FILE* nonnull %call9) + %call15 = tail call i8* @create4DTensor(i32 %data_type, i32 0, i64 %dim1_size, i64 %dim2_size, i64 %dim3_size, i64 %dim4_size) #2 + %0 = bitcast i8* %call15 to %struct.Tensor* + tail call void @initTensorData(i8* %call15, i8* %call, i64 %mul6) #2 + tail call void @free(i8* %call) #2 + ret %struct.Tensor* %0 +} + +; Function Attrs: nounwind uwtable +define %struct.Tensor* @_Z14readInputBatchPKciiiiii(i8* %file_name, i32 %data_type, i32 %start, i32 %end, i32 %dim2_size, i32 %dim3_size, i32 %dim4_size) local_unnamed_addr #3 { +entry: + %sub = sub nsw i32 %end, %start + %mul = mul nsw i32 %sub, %dim2_size + %mul1 = mul nsw i32 %mul, %dim3_size + %mul2 = mul nsw i32 %mul1, %dim4_size + %conv = sext i32 %mul2 to i64 + %mul3 = shl i32 %sub, 2 %mul4 = mul nsw i32 %mul3, %dim2_size %mul5 = mul nsw i32 %mul4, %dim3_size %mul6 = mul nsw i32 %mul5, %dim4_size - %call11 = tail call i32 @fseek(%struct._IO_FILE* nonnull %call8, i64 0, i32 1) - %conv12 = sext i32 %mul6 to i64 - %call13 = tail call i64 @fread(i8* %call, i64 1, i64 %conv12, %struct._IO_FILE* nonnull %call8) - %conv14 = sext i32 %dim1_size to i64 - %conv15 = sext i32 %dim2_size to i64 - %conv16 = sext i32 %dim3_size to i64 - %conv17 = sext i32 %dim4_size to i64 - %call18 = tail call i8* @create4DTensor(i32 %data_type, i32 0, i64 %conv14, i64 %conv15, i64 %conv16, i64 %conv17) #7 - %1 = bitcast i8* %call18 to %struct.Tensor* - tail call void @initTensorData(i8* %call18, i8* %call, i64 %conv12) #7 - tail call void @hpvm_request_tensor(i8* %call18, i32 0) #7 - %host_data.i = getelementptr inbounds i8, i8* %call18, i64 32 - %2 = bitcast i8* %host_data.i to float** - %3 = load float*, float** %2, align 8, !tbaa !15 - %cmp11.i = icmp eq i32 %mul2, 0 - br i1 %cmp11.i, label %_Z13compareValuesPvPfm.exit, label %for.body.i.preheader - -for.body.i.preheader: ; preds = %if.end - br label %for.body.i - -for.cond.i: ; preds = %for.body.i - %conv.i = zext i32 %inc.i to i64 - %cmp.i = icmp ult i64 %conv.i, %conv - br i1 %cmp.i, label %for.body.i, label %_Z13compareValuesPvPfm.exit.loopexit - -for.body.i: ; preds = %for.body.i.preheader, %for.cond.i - %conv13.i = phi i64 [ %conv.i, %for.cond.i ], [ 0, %for.body.i.preheader ] - %i.012.i = phi i32 [ %inc.i, %for.cond.i ], [ 0, %for.body.i.preheader ] - %arrayidx.i = getelementptr inbounds float, float* %3, i64 %conv13.i - %4 = load float, float* %arrayidx.i, align 4, !tbaa !17 - %arrayidx2.i = getelementptr inbounds float, float* %0, i64 %conv13.i - %5 = load float, float* %arrayidx2.i, align 4, !tbaa !17 - %cmp3.i = fcmp fast une float %4, %5 - %inc.i = add i32 %i.012.i, 1 - br i1 %cmp3.i, label %if.then.i, label %for.cond.i - -if.then.i: ; preds = %for.body.i - %call.i = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([21 x i8], [21 x i8]* @.str.14, i64 0, i64 0)) #7 - tail call void @abort() #8 + %conv7 = sext i32 %mul6 to i64 + %mul8 = shl nsw i64 %conv, 2 + %call = tail call noalias i8* @malloc(i64 %mul8) #2 + %call13 = tail call %struct._IO_FILE* @fopen(i8* %file_name, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.14, i64 0, i64 0)) + %cmp = icmp eq %struct._IO_FILE* %call13, null + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %call14 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([41 x i8], [41 x i8]* @.str.15, i64 0, i64 0), i8* %file_name) + tail call void @abort() #13 unreachable -_Z13compareValuesPvPfm.exit.loopexit: ; preds = %for.cond.i - br label %_Z13compareValuesPvPfm.exit +if.end: ; preds = %entry + %mul9 = shl i32 %start, 2 + %mul10 = mul nsw i32 %mul9, %dim2_size + %mul11 = mul nsw i32 %mul10, %dim3_size + %mul12 = mul nsw i32 %mul11, %dim4_size + %conv15 = sext i32 %mul12 to i64 + %call16 = tail call i32 @fseek(%struct._IO_FILE* nonnull %call13, i64 %conv15, i32 0) + %call17 = tail call i64 @fread(i8* %call, i64 1, i64 %conv7, %struct._IO_FILE* nonnull %call13) + %call18 = tail call i32 @fclose(%struct._IO_FILE* nonnull %call13) + %conv19 = sext i32 %sub to i64 + %conv20 = sext i32 %dim2_size to i64 + %conv21 = sext i32 %dim3_size to i64 + %conv22 = sext i32 %dim4_size to i64 + %call23 = tail call i8* @create4DTensor(i32 %data_type, i32 0, i64 %conv19, i64 %conv20, i64 %conv21, i64 %conv22) #2 + %0 = bitcast i8* %call23 to %struct.Tensor* + tail call void @initTensorData(i8* %call23, i8* %call, i64 %conv7) #2 + tail call void @free(i8* %call) #2 + ret %struct.Tensor* %0 +} + +; Function Attrs: nounwind uwtable +define i8* @_Z14copyInputBatchPKciiiiiPv(i8* %file_name, i32 %start, i32 %end, i32 %dim2_size, i32 %dim3_size, i32 %dim4_size, i8* returned %inputTensor_ptr) local_unnamed_addr #3 { +entry: + %0 = bitcast i8* %inputTensor_ptr to %struct.Tensor* + %sub = sub nsw i32 %end, %start + %mul = mul nsw i32 %sub, %dim2_size + %mul1 = mul nsw i32 %mul, %dim3_size + %mul2 = mul nsw i32 %mul1, %dim4_size + %conv = sext i32 %mul2 to i64 + %mul3 = shl i32 %sub, 2 + %mul4 = mul nsw i32 %mul3, %dim2_size + %mul5 = mul nsw i32 %mul4, %dim3_size + %mul6 = mul nsw i32 %mul5, %dim4_size + %conv7 = sext i32 %mul6 to i64 + %mul8 = shl nsw i64 %conv, 2 + %call = tail call noalias i8* @malloc(i64 %mul8) #2 + %call13 = tail call %struct._IO_FILE* @fopen(i8* %file_name, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.14, i64 0, i64 0)) + %cmp = icmp eq %struct._IO_FILE* %call13, null + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %call14 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([41 x i8], [41 x i8]* @.str.15, i64 0, i64 0), i8* %file_name) + tail call void @abort() #13 + unreachable -_Z13compareValuesPvPfm.exit: ; preds = %_Z13compareValuesPvPfm.exit.loopexit, %if.end - ret %struct.Tensor* %1 +if.end: ; preds = %entry + %mul9 = shl i32 %start, 2 + %mul10 = mul nsw i32 %mul9, %dim2_size + %mul11 = mul nsw i32 %mul10, %dim3_size + %mul12 = mul nsw i32 %mul11, %dim4_size + %conv15 = sext i32 %mul12 to i64 + %call16 = tail call i32 @fseek(%struct._IO_FILE* nonnull %call13, i64 %conv15, i32 0) + %call17 = tail call i64 @fread(i8* %call, i64 1, i64 %conv7, %struct._IO_FILE* nonnull %call13) + %call18 = tail call i32 @fclose(%struct._IO_FILE* nonnull %call13) + tail call void @initTensorData(i8* %inputTensor_ptr, i8* %call, i64 %conv7) #2 + tail call void @free(i8* %call) #2 + %dims = getelementptr inbounds i8, i8* %inputTensor_ptr, i64 88 + %num_dims = bitcast i8* %dims to i32* + %1 = load i32, i32* %num_dims, align 8, !tbaa !13 + %call19 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([31 x i8], [31 x i8]* @.str.18, i64 0, i64 0), i32 %1) + %host_data = getelementptr inbounds i8, i8* %inputTensor_ptr, i64 48 + %2 = bitcast i8* %host_data to i8** + %3 = load i8*, i8** %2, align 8, !tbaa !17 + %cmp20 = icmp eq i8* %3, null + br i1 %cmp20, label %if.then22, label %lor.lhs.false + +lor.lhs.false: ; preds = %if.end + %gpu_data = getelementptr inbounds i8, i8* %inputTensor_ptr, i64 56 + %4 = bitcast i8* %gpu_data to i8** + %5 = load i8*, i8** %4, align 8, !tbaa !7 + %cmp21 = icmp eq i8* %5, null + br i1 %cmp21, label %if.then22, label %if.end24 + +if.then22: ; preds = %lor.lhs.false, %if.end + %puts = tail call i32 @puts(i8* getelementptr inbounds ([27 x i8], [27 x i8]* @str.78, i64 0, i64 0)) + br label %if.end24 + +if.end24: ; preds = %if.then22, %lor.lhs.false + tail call void @changeTensorPlacement(%struct.Tensor* nonnull %0, i32 0) #2 + ret i8* %inputTensor_ptr } +declare void @changeTensorPlacement(%struct.Tensor*, i32) local_unnamed_addr #0 + ; Function Attrs: nounwind uwtable -define noalias i8* @_Z10readLabelsPKci(i8* %labels_file, i32 %num_labels) local_unnamed_addr #0 { +define noalias i8* @_Z10readLabelsPKci(i8* %labels_file, i32 %num_labels) local_unnamed_addr #3 { entry: %conv = sext i32 %num_labels to i64 - %call = tail call noalias i8* @malloc(i64 %conv) #7 - %call1 = tail call %struct._IO_FILE* @fopen(i8* %labels_file, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.15, i64 0, i64 0)) + %call = tail call noalias i8* @malloc(i64 %conv) #2 + %call1 = tail call %struct._IO_FILE* @fopen(i8* %labels_file, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.14, i64 0, i64 0)) %cmp = icmp eq %struct._IO_FILE* %call1, null br i1 %cmp, label %if.then, label %if.end if.then: ; preds = %entry - %call2 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([40 x i8], [40 x i8]* @.str.18, i64 0, i64 0), i8* %labels_file) - tail call void @abort() #8 + %call2 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([40 x i8], [40 x i8]* @.str.20, i64 0, i64 0), i8* %labels_file) + tail call void @abort() #13 unreachable if.end: ; preds = %entry %call5 = tail call i64 @fread(i8* %call, i64 1, i64 %conv, %struct._IO_FILE* nonnull %call1) - %call6 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([26 x i8], [26 x i8]* @.str.19, i64 0, i64 0), i64 %call5) + %call6 = tail call i32 @fclose(%struct._IO_FILE* nonnull %call1) + ret i8* %call +} + +; Function Attrs: nounwind uwtable +define noalias i32* @_Z11readLabels3PKci(i8* %labels_file, i32 %num_labels) local_unnamed_addr #3 { +entry: + %conv = sext i32 %num_labels to i64 + %mul = shl nsw i64 %conv, 2 + %call = tail call noalias i8* @malloc(i64 %mul) #2 + %call1 = tail call %struct._IO_FILE* @fopen(i8* %labels_file, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.14, i64 0, i64 0)) + %cmp = icmp eq %struct._IO_FILE* %call1, null + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %call2 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([40 x i8], [40 x i8]* @.str.20, i64 0, i64 0), i8* %labels_file) + tail call void @abort() #13 + unreachable + +if.end: ; preds = %entry + %0 = bitcast i8* %call to i32* + %call5 = tail call i64 @fread(i8* %call, i64 1, i64 %mul, %struct._IO_FILE* nonnull %call1) + %call6 = tail call i32 @fclose(%struct._IO_FILE* nonnull %call1) + ret i32* %0 +} + +; Function Attrs: nounwind uwtable +define noalias i8* @_Z15readLabelsBatchPKcii(i8* %labels_file, i32 %start, i32 %end) local_unnamed_addr #3 { +entry: + %sub = sub nsw i32 %end, %start + %conv2 = sext i32 %sub to i64 + %call = tail call noalias i8* @malloc(i64 %conv2) #2 + %call4 = tail call %struct._IO_FILE* @fopen(i8* %labels_file, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.14, i64 0, i64 0)) + %cmp = icmp eq %struct._IO_FILE* %call4, null + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %call5 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([40 x i8], [40 x i8]* @.str.20, i64 0, i64 0), i8* %labels_file) + tail call void @abort() #13 + unreachable + +if.end: ; preds = %entry + %conv = sext i32 %start to i64 + %call7 = tail call i32 @fseek(%struct._IO_FILE* nonnull %call4, i64 %conv, i32 0) + %call10 = tail call i64 @fread(i8* %call, i64 1, i64 %conv2, %struct._IO_FILE* nonnull %call4) + %call11 = tail call i32 @fclose(%struct._IO_FILE* nonnull %call4) ret i8* %call } ; Function Attrs: nounwind uwtable -define void @_Z15computeAccuracyPciPv(i8* %labels_file, i32 %num_labels, i8* nocapture readonly %result_ptr) local_unnamed_addr #0 { +define noalias i32* @_Z16readLabelsBatch3PKcii(i8* %labels_file, i32 %start, i32 %end) local_unnamed_addr #3 { +entry: + %sub = sub nsw i32 %end, %start + %conv2 = sext i32 %sub to i64 + %mul3 = shl nsw i64 %conv2, 2 + %call = tail call noalias i8* @malloc(i64 %mul3) #2 + %call4 = tail call %struct._IO_FILE* @fopen(i8* %labels_file, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.14, i64 0, i64 0)) + %cmp = icmp eq %struct._IO_FILE* %call4, null + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %call5 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([40 x i8], [40 x i8]* @.str.20, i64 0, i64 0), i8* %labels_file) + tail call void @abort() #13 + unreachable + +if.end: ; preds = %entry + %0 = bitcast i8* %call to i32* + %mul = shl i32 %start, 2 + %conv6 = sext i32 %mul to i64 + %call7 = tail call i32 @fseek(%struct._IO_FILE* nonnull %call4, i64 %conv6, i32 0) + %call10 = tail call i64 @fread(i8* %call, i64 1, i64 %mul3, %struct._IO_FILE* nonnull %call4) + %call11 = tail call i32 @fclose(%struct._IO_FILE* nonnull %call4) + ret i32* %0 +} + +; Function Attrs: nounwind uwtable +define void @_Z15computeAccuracyPKciPv(i8* %labels_file, i32 %num_labels, i8* nocapture readonly %result_ptr) local_unnamed_addr #3 { entry: %ss = alloca %"class.std::__cxx11::basic_ostringstream", align 16 %print_str = alloca %"class.std::__cxx11::basic_string", align 8 %conv.i = sext i32 %num_labels to i64 - %call.i = tail call noalias i8* @malloc(i64 %conv.i) #7 - %call1.i = tail call %struct._IO_FILE* @fopen(i8* %labels_file, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.15, i64 0, i64 0)) #7 + %call.i = tail call noalias i8* @malloc(i64 %conv.i) #2 + %call1.i = tail call %struct._IO_FILE* @fopen(i8* %labels_file, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.14, i64 0, i64 0)) #2 %cmp.i = icmp eq %struct._IO_FILE* %call1.i, null br i1 %cmp.i, label %if.then.i, label %_Z10readLabelsPKci.exit if.then.i: ; preds = %entry - %call2.i = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([40 x i8], [40 x i8]* @.str.18, i64 0, i64 0), i8* %labels_file) #7 - tail call void @abort() #8 + %call2.i = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([40 x i8], [40 x i8]* @.str.20, i64 0, i64 0), i8* %labels_file) #2 + tail call void @abort() #13 unreachable _Z10readLabelsPKci.exit: ; preds = %entry - %call5.i = tail call i64 @fread(i8* %call.i, i64 1, i64 %conv.i, %struct._IO_FILE* nonnull %call1.i) #7 - %call6.i = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([26 x i8], [26 x i8]* @.str.19, i64 0, i64 0), i64 %call5.i) #7 - %dim_sizes = getelementptr inbounds i8, i8* %result_ptr, i64 72 + %call5.i = tail call i64 @fread(i8* %call.i, i64 1, i64 %conv.i, %struct._IO_FILE* nonnull %call1.i) #2 + %call6.i = tail call i32 @fclose(%struct._IO_FILE* nonnull %call1.i) #2 + %dim_sizes = getelementptr inbounds i8, i8* %result_ptr, i64 96 %0 = bitcast i8* %dim_sizes to i64** - %1 = load i64*, i64** %0, align 8, !tbaa !11 - %2 = load i64, i64* %1, align 8, !tbaa !12 + %1 = load i64*, i64** %0, align 8, !tbaa !14 + %2 = load i64, i64* %1, align 8, !tbaa !15 %arrayidx3 = getelementptr inbounds i64, i64* %1, i64 1 - %3 = load i64, i64* %arrayidx3, align 8, !tbaa !12 - %host_data = getelementptr inbounds i8, i8* %result_ptr, i64 32 + %3 = load i64, i64* %arrayidx3, align 8, !tbaa !15 + %host_data = getelementptr inbounds i8, i8* %result_ptr, i64 48 %4 = bitcast i8* %host_data to float** - %5 = load float*, float** %4, align 8, !tbaa !15 + %5 = load float*, float** %4, align 8, !tbaa !17 %cmp92 = icmp eq i64 %2, 0 br i1 %cmp92, label %for.cond.cleanup, label %for.cond4.preheader.preheader @@ -1410,86 +1799,86 @@ for.cond4.preheader: ; preds = %for.cond4.preheader %num_errors.094 = phi i32 [ %num_errors.0.inc21, %for.cond4.preheader ], [ 0, %for.cond4.preheader.preheader ] %mul = mul i64 %indvars.iv, %3 %arrayidx10 = getelementptr inbounds float, float* %5, i64 %mul - %6 = load float, float* %arrayidx10, align 4, !tbaa !17 + %6 = load float, float* %arrayidx10, align 4, !tbaa !20 %add14 = add i64 %mul, 1 %arrayidx15 = getelementptr inbounds float, float* %5, i64 %add14 - %7 = load float, float* %arrayidx15, align 4, !tbaa !17 + %7 = load float, float* %arrayidx15, align 4, !tbaa !20 %cmp16 = fcmp fast olt float %6, %7 %chosen.1 = zext i1 %cmp16 to i32 %conv9.1 = zext i1 %cmp16 to i64 %add.1 = add i64 %conv9.1, %mul %arrayidx10.1 = getelementptr inbounds float, float* %5, i64 %add.1 - %8 = load float, float* %arrayidx10.1, align 4, !tbaa !17 + %8 = load float, float* %arrayidx10.1, align 4, !tbaa !20 %add14.1 = add i64 %mul, 2 %arrayidx15.1 = getelementptr inbounds float, float* %5, i64 %add14.1 - %9 = load float, float* %arrayidx15.1, align 4, !tbaa !17 + %9 = load float, float* %arrayidx15.1, align 4, !tbaa !20 %cmp16.1 = fcmp fast olt float %8, %9 %chosen.1.1 = select i1 %cmp16.1, i32 2, i32 %chosen.1 %conv9.296 = zext i32 %chosen.1.1 to i64 %add.2 = add i64 %conv9.296, %mul %arrayidx10.2 = getelementptr inbounds float, float* %5, i64 %add.2 - %10 = load float, float* %arrayidx10.2, align 4, !tbaa !17 + %10 = load float, float* %arrayidx10.2, align 4, !tbaa !20 %add14.2 = add i64 %mul, 3 %arrayidx15.2 = getelementptr inbounds float, float* %5, i64 %add14.2 - %11 = load float, float* %arrayidx15.2, align 4, !tbaa !17 + %11 = load float, float* %arrayidx15.2, align 4, !tbaa !20 %cmp16.2 = fcmp fast olt float %10, %11 %chosen.1.2 = select i1 %cmp16.2, i32 3, i32 %chosen.1.1 %conv9.397 = zext i32 %chosen.1.2 to i64 %add.3 = add i64 %conv9.397, %mul %arrayidx10.3 = getelementptr inbounds float, float* %5, i64 %add.3 - %12 = load float, float* %arrayidx10.3, align 4, !tbaa !17 + %12 = load float, float* %arrayidx10.3, align 4, !tbaa !20 %add14.3 = add i64 %mul, 4 %arrayidx15.3 = getelementptr inbounds float, float* %5, i64 %add14.3 - %13 = load float, float* %arrayidx15.3, align 4, !tbaa !17 + %13 = load float, float* %arrayidx15.3, align 4, !tbaa !20 %cmp16.3 = fcmp fast olt float %12, %13 %chosen.1.3 = select i1 %cmp16.3, i32 4, i32 %chosen.1.2 %conv9.498 = zext i32 %chosen.1.3 to i64 %add.4 = add i64 %conv9.498, %mul %arrayidx10.4 = getelementptr inbounds float, float* %5, i64 %add.4 - %14 = load float, float* %arrayidx10.4, align 4, !tbaa !17 + %14 = load float, float* %arrayidx10.4, align 4, !tbaa !20 %add14.4 = add i64 %mul, 5 %arrayidx15.4 = getelementptr inbounds float, float* %5, i64 %add14.4 - %15 = load float, float* %arrayidx15.4, align 4, !tbaa !17 + %15 = load float, float* %arrayidx15.4, align 4, !tbaa !20 %cmp16.4 = fcmp fast olt float %14, %15 %chosen.1.4 = select i1 %cmp16.4, i32 5, i32 %chosen.1.3 %conv9.599 = zext i32 %chosen.1.4 to i64 %add.5 = add i64 %conv9.599, %mul %arrayidx10.5 = getelementptr inbounds float, float* %5, i64 %add.5 - %16 = load float, float* %arrayidx10.5, align 4, !tbaa !17 + %16 = load float, float* %arrayidx10.5, align 4, !tbaa !20 %add14.5 = add i64 %mul, 6 %arrayidx15.5 = getelementptr inbounds float, float* %5, i64 %add14.5 - %17 = load float, float* %arrayidx15.5, align 4, !tbaa !17 + %17 = load float, float* %arrayidx15.5, align 4, !tbaa !20 %cmp16.5 = fcmp fast olt float %16, %17 %chosen.1.5 = select i1 %cmp16.5, i32 6, i32 %chosen.1.4 %18 = zext i32 %chosen.1.5 to i64 %add.6 = add i64 %18, %mul %arrayidx10.6 = getelementptr inbounds float, float* %5, i64 %add.6 - %19 = load float, float* %arrayidx10.6, align 4, !tbaa !17 + %19 = load float, float* %arrayidx10.6, align 4, !tbaa !20 %add14.6 = add i64 %mul, 7 %arrayidx15.6 = getelementptr inbounds float, float* %5, i64 %add14.6 - %20 = load float, float* %arrayidx15.6, align 4, !tbaa !17 + %20 = load float, float* %arrayidx15.6, align 4, !tbaa !20 %cmp16.6 = fcmp fast olt float %19, %20 %chosen.1.6 = select i1 %cmp16.6, i32 7, i32 %chosen.1.5 %conv9.7 = sext i32 %chosen.1.6 to i64 %add.7 = add i64 %conv9.7, %mul %arrayidx10.7 = getelementptr inbounds float, float* %5, i64 %add.7 - %21 = load float, float* %arrayidx10.7, align 4, !tbaa !17 + %21 = load float, float* %arrayidx10.7, align 4, !tbaa !20 %add14.7 = add i64 %mul, 8 %arrayidx15.7 = getelementptr inbounds float, float* %5, i64 %add14.7 - %22 = load float, float* %arrayidx15.7, align 4, !tbaa !17 + %22 = load float, float* %arrayidx15.7, align 4, !tbaa !20 %cmp16.7 = fcmp fast olt float %21, %22 %chosen.1.7 = select i1 %cmp16.7, i32 8, i32 %chosen.1.6 %conv9.8 = sext i32 %chosen.1.7 to i64 %add.8 = add i64 %conv9.8, %mul %arrayidx10.8 = getelementptr inbounds float, float* %5, i64 %add.8 - %23 = load float, float* %arrayidx10.8, align 4, !tbaa !17 + %23 = load float, float* %arrayidx10.8, align 4, !tbaa !20 %add14.8 = add i64 %mul, 9 %arrayidx15.8 = getelementptr inbounds float, float* %5, i64 %add14.8 - %24 = load float, float* %arrayidx15.8, align 4, !tbaa !17 + %24 = load float, float* %arrayidx15.8, align 4, !tbaa !20 %cmp16.8 = fcmp fast olt float %23, %24 %chosen.1.8 = select i1 %cmp16.8, i32 9, i32 %chosen.1.7 %arrayidx17 = getelementptr inbounds i8, i8* %call.i, i64 %indvars.iv - %25 = load i8, i8* %arrayidx17, align 1, !tbaa !36 + %25 = load i8, i8* %arrayidx17, align 1, !tbaa !42 %conv18 = zext i8 %25 to i32 %not.cmp19 = icmp ne i32 %chosen.1.8, %conv18 %inc21 = zext i1 %not.cmp19 to i32 @@ -1511,31 +1900,31 @@ for.cond.cleanup: ; preds = %for.cond.cleanup.lo %mul31 = fmul fast double %div, 1.000000e+02 %conv32 = fptrunc double %mul31 to float %conv33 = fpext float %conv32 to double - %call34 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.20, i64 0, i64 0), double %conv33) - %call35 = tail call %struct._IO_FILE* @fopen(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.21, i64 0, i64 0), i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.22, i64 0, i64 0)) + %call34 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.21, i64 0, i64 0), double %conv33) + %call35 = tail call %struct._IO_FILE* @fopen(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.22, i64 0, i64 0), i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.23, i64 0, i64 0)) %cmp36 = icmp eq %struct._IO_FILE* %call35, null br i1 %cmp36, label %if.end44, label %if.then37 if.then37: ; preds = %for.cond.cleanup %26 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i8* - call void @llvm.lifetime.start(i64 376, i8* nonnull %26) #7 + call void @llvm.lifetime.start(i64 376, i8* nonnull %26) #2 %27 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2 %28 = getelementptr inbounds %"class.std::basic_ios", %"class.std::basic_ios"* %27, i64 0, i32 0 - call void @_ZNSt8ios_baseC2Ev(%"class.std::ios_base"* %28) #7 + call void @_ZNSt8ios_baseC2Ev(%"class.std::ios_base"* %28) #2 %29 = getelementptr inbounds %"class.std::basic_ios", %"class.std::basic_ios"* %27, i64 0, i32 0, i32 0 - store i32 (...)** bitcast (i8** getelementptr inbounds ({ [4 x i8*] }, { [4 x i8*] }* @_ZTVSt9basic_iosIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %29, align 16, !tbaa !40 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [4 x i8*] }, { [4 x i8*] }* @_ZTVSt9basic_iosIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %29, align 16, !tbaa !46 %_M_tie.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 1 - store %"class.std::basic_ostream"* null, %"class.std::basic_ostream"** %_M_tie.i.i, align 8, !tbaa !42 + store %"class.std::basic_ostream"* null, %"class.std::basic_ostream"** %_M_tie.i.i, align 8, !tbaa !48 %_M_fill.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 2 - store i8 0, i8* %_M_fill.i.i, align 16, !tbaa !45 + store i8 0, i8* %_M_fill.i.i, align 16, !tbaa !51 %_M_fill_init.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 3 - store i8 0, i8* %_M_fill_init.i.i, align 1, !tbaa !46 + store i8 0, i8* %_M_fill_init.i.i, align 1, !tbaa !52 %_M_streambuf.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 4 %30 = bitcast %"class.std::basic_streambuf"** %_M_streambuf.i.i to i8* - call void @llvm.memset.p0i8.i64(i8* %30, i8 0, i64 32, i32 8, i1 false) #7 + call void @llvm.memset.p0i8.i64(i8* %30, i8 0, i64 32, i32 8, i1 false) #2 %31 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 1) to i64*), align 8 %32 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i64* - store i64 %31, i64* %32, align 16, !tbaa !40 + store i64 %31, i64* %32, align 16, !tbaa !46 %33 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 2) to i64*), align 8 %34 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i8** %vtable.cast.i.i = inttoptr i64 %31 to i8* @@ -1544,79 +1933,79 @@ if.then37: ; preds = %for.cond.cleanup %vbase.offset.i.i = load i64, i64* %35, align 8 %add.ptr.i.i = getelementptr inbounds i8, i8* %26, i64 %vbase.offset.i.i %36 = bitcast i8* %add.ptr.i.i to i64* - store i64 %33, i64* %36, align 8, !tbaa !40 - %vtable3.i.i = load i8*, i8** %34, align 16, !tbaa !40 + store i64 %33, i64* %36, align 8, !tbaa !46 + %vtable3.i.i = load i8*, i8** %34, align 16, !tbaa !46 %vbase.offset.ptr4.i.i = getelementptr i8, i8* %vtable3.i.i, i64 -24 %37 = bitcast i8* %vbase.offset.ptr4.i.i to i64* %vbase.offset5.i.i = load i64, i64* %37, align 8 %add.ptr6.i.i = getelementptr inbounds i8, i8* %26, i64 %vbase.offset5.i.i %38 = bitcast i8* %add.ptr6.i.i to %"class.std::basic_ios"* - call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %38, %"class.std::basic_streambuf"* null) #7 - store i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 1, i64 3) to i32 (...)**), i32 (...)*** %29, align 16, !tbaa !40 + call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %38, %"class.std::basic_streambuf"* null) #2 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 1, i64 3) to i32 (...)**), i32 (...)*** %29, align 16, !tbaa !46 %_M_stringbuf.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1 %39 = getelementptr inbounds %"class.std::__cxx11::basic_stringbuf", %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i, i64 0, i32 0, i32 0 %40 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to <2 x i32 (...)**>* - store <2 x i32 (...)**> <i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 3) to i32 (...)**), i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**)>, <2 x i32 (...)**>* %40, align 16, !tbaa !40 + store <2 x i32 (...)**> <i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 3) to i32 (...)**), i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**)>, <2 x i32 (...)**>* %40, align 16, !tbaa !46 %_M_in_beg.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 1 %_M_buf_locale.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 7 %41 = bitcast i8** %_M_in_beg.i.i.i to i8* - call void @llvm.memset.p0i8.i64(i8* %41, i8 0, i64 48, i32 8, i1 false) #7 - call void @_ZNSt6localeC1Ev(%"class.std::locale"* %_M_buf_locale.i.i.i) #7 - store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %39, align 8, !tbaa !40 + call void @llvm.memset.p0i8.i64(i8* %41, i8 0, i64 48, i32 8, i1 false) #2 + call void @_ZNSt6localeC1Ev(%"class.std::locale"* %_M_buf_locale.i.i.i) #2 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %39, align 8, !tbaa !46 %_M_mode.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 1 - store i32 16, i32* %_M_mode.i.i, align 8, !tbaa !47 + store i32 16, i32* %_M_mode.i.i, align 8, !tbaa !53 %_M_string.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2 %42 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 2 %43 = bitcast %"class.std::__cxx11::basic_string"* %_M_string.i.i to %union.anon** - store %union.anon* %42, %union.anon** %43, align 8, !tbaa !52 + store %union.anon* %42, %union.anon** %43, align 8, !tbaa !58 %_M_string_length.i.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 1 - store i64 0, i64* %_M_string_length.i.i.i.i.i, align 8, !tbaa !53 + store i64 0, i64* %_M_string_length.i.i.i.i.i, align 8, !tbaa !59 %.cast.i.i.i = bitcast %union.anon* %42 to i8* - store i8 0, i8* %.cast.i.i.i, align 8, !tbaa !36 - %vtable.i = load i8*, i8** %34, align 16, !tbaa !40 + store i8 0, i8* %.cast.i.i.i, align 8, !tbaa !42 + %vtable.i = load i8*, i8** %34, align 16, !tbaa !46 %vbase.offset.ptr.i = getelementptr i8, i8* %vtable.i, i64 -24 %44 = bitcast i8* %vbase.offset.ptr.i to i64* %vbase.offset.i = load i64, i64* %44, align 8 %add.ptr2.i = getelementptr inbounds i8, i8* %26, i64 %vbase.offset.i %45 = bitcast i8* %add.ptr2.i to %"class.std::basic_ios"* %46 = getelementptr inbounds %"class.std::__cxx11::basic_stringbuf", %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i, i64 0, i32 0 - call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %45, %"class.std::basic_streambuf"* %46) #7 + call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %45, %"class.std::basic_streambuf"* %46) #2 %47 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to %"class.std::basic_ostream"* - %vtable.i74 = load i8*, i8** %34, align 16, !tbaa !40 + %vtable.i74 = load i8*, i8** %34, align 16, !tbaa !46 %vbase.offset.ptr.i75 = getelementptr i8, i8* %vtable.i74, i64 -24 %48 = bitcast i8* %vbase.offset.ptr.i75 to i64* %vbase.offset.i76 = load i64, i64* %48, align 8 %add.ptr.i = getelementptr inbounds i8, i8* %26, i64 %vbase.offset.i76 %_M_flags.i = getelementptr inbounds i8, i8* %add.ptr.i, i64 24 %49 = bitcast i8* %_M_flags.i to i32* - %50 = load i32, i32* %49, align 4, !tbaa !54 + %50 = load i32, i32* %49, align 4, !tbaa !60 %and.i = and i32 %50, -261 %or.i = or i32 %and.i, 4 - store i32 %or.i, i32* %49, align 4, !tbaa !54 - %call.i84 = call dereferenceable(272) %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"* nonnull %47, double %conv33) #7 + store i32 %or.i, i32* %49, align 4, !tbaa !60 + %call.i84 = call dereferenceable(272) %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"* nonnull %47, double %conv33) #2 %51 = bitcast %"class.std::__cxx11::basic_string"* %print_str to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %51) #7 - call void @_ZNKSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEE3strEv(%"class.std::__cxx11::basic_string"* nonnull sret %print_str, %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %51) #2 + call void @_ZNKSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEE3strEv(%"class.std::__cxx11::basic_string"* nonnull sret %print_str, %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i) #2 %_M_p.i.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 0, i32 0 - %52 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !56 + %52 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !62 %_M_string_length.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 1 - %53 = load i64, i64* %_M_string_length.i, align 8, !tbaa !53 + %53 = load i64, i64* %_M_string_length.i, align 8, !tbaa !59 %call42 = call i64 @fwrite(i8* %52, i64 1, i64 %53, %struct._IO_FILE* nonnull %call35) %call43 = call i32 @fclose(%struct._IO_FILE* nonnull %call35) - %54 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !56 + %54 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !62 %55 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 2 %arraydecay.i.i.i.i = bitcast %union.anon* %55 to i8* %cmp.i.i.i = icmp eq i8* %54, %arraydecay.i.i.i.i br i1 %cmp.i.i.i, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit, label %if.then.i.i if.then.i.i: ; preds = %if.then37 - call void @_ZdlPv(i8* %54) #7 + call void @_ZdlPv(i8* %54) #2 br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit: ; preds = %if.then37, %if.then.i.i - call void @llvm.lifetime.end(i64 32, i8* nonnull %51) #7 + call void @llvm.lifetime.end(i64 32, i8* nonnull %51) #2 %56 = load i64, i64* bitcast ([4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE to i64*), align 8 - store i64 %56, i64* %32, align 16, !tbaa !40 + store i64 %56, i64* %32, align 16, !tbaa !46 %57 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 3) to i64*), align 8 %vtable.cast.i.i86 = inttoptr i64 %56 to i8* %vbase.offset.ptr.i.i87 = getelementptr i8, i8* %vtable.cast.i.i86, i64 -24 @@ -1624,24 +2013,24 @@ _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit: ; preds = %if.th %vbase.offset.i.i88 = load i64, i64* %58, align 8 %add.ptr.i.i89 = getelementptr inbounds i8, i8* %26, i64 %vbase.offset.i.i88 %59 = bitcast i8* %add.ptr.i.i89 to i64* - store i64 %57, i64* %59, align 8, !tbaa !40 + store i64 %57, i64* %59, align 8, !tbaa !46 %60 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 0 - store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %60, align 8, !tbaa !40 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %60, align 8, !tbaa !46 %_M_p.i.i.i.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 0, i32 0 - %61 = load i8*, i8** %_M_p.i.i.i.i.i.i.i, align 8, !tbaa !56 + %61 = load i8*, i8** %_M_p.i.i.i.i.i.i.i, align 8, !tbaa !62 %cmp.i.i.i.i.i.i = icmp eq i8* %61, %.cast.i.i.i br i1 %cmp.i.i.i.i.i.i, label %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit, label %if.then.i.i.i.i.i if.then.i.i.i.i.i: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit - call void @_ZdlPv(i8* %61) #7 + call void @_ZdlPv(i8* %61) #2 br label %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit _ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit, %if.then.i.i.i.i.i - store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %60, align 8, !tbaa !40 - call void @_ZNSt6localeD1Ev(%"class.std::locale"* nonnull %_M_buf_locale.i.i.i) #7 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %60, align 8, !tbaa !46 + call void @_ZNSt6localeD1Ev(%"class.std::locale"* nonnull %_M_buf_locale.i.i.i) #2 %62 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 0 - call void @_ZNSt8ios_baseD2Ev(%"class.std::ios_base"* %62) #7 - call void @llvm.lifetime.end(i64 376, i8* nonnull %26) #7 + call void @_ZNSt8ios_baseD2Ev(%"class.std::ios_base"* %62) #2 + call void @llvm.lifetime.end(i64 376, i8* nonnull %26) #2 br label %if.end44 if.end44: ; preds = %for.cond.cleanup, %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit @@ -1649,1016 +2038,3342 @@ if.end44: ; preds = %for.cond.cleanup, % } ; Function Attrs: nounwind uwtable -define void @_Z16computeAccuracy2PhiPv(i8* nocapture readonly %labels, i32 %num_labels, i8* nocapture readonly %result_ptr) local_unnamed_addr #0 { +define float @_Z16computeAccuracy2PhiPvm(i8* nocapture readonly %labels, i32 %batch_size, i8* nocapture readonly %result_ptr, i64 %num_classes) local_unnamed_addr #3 { entry: %ss = alloca %"class.std::__cxx11::basic_ostringstream", align 16 %print_str = alloca %"class.std::__cxx11::basic_string", align 8 - %dim_sizes = getelementptr inbounds i8, i8* %result_ptr, i64 72 + %dim_sizes = getelementptr inbounds i8, i8* %result_ptr, i64 96 %0 = bitcast i8* %dim_sizes to i64** - %1 = load i64*, i64** %0, align 8, !tbaa !11 - %2 = load i64, i64* %1, align 8, !tbaa !12 + %1 = load i64*, i64** %0, align 8, !tbaa !14 + %2 = load i64, i64* %1, align 8, !tbaa !15 %arrayidx3 = getelementptr inbounds i64, i64* %1, i64 1 - %3 = load i64, i64* %arrayidx3, align 8, !tbaa !12 - %host_data = getelementptr inbounds i8, i8* %result_ptr, i64 32 + %3 = load i64, i64* %arrayidx3, align 8, !tbaa !15 + %host_data = getelementptr inbounds i8, i8* %result_ptr, i64 48 %4 = bitcast i8* %host_data to float** - %5 = load float*, float** %4, align 8, !tbaa !15 - %cmp82 = icmp eq i64 %2, 0 - br i1 %cmp82, label %for.cond.cleanup, label %for.cond4.preheader.preheader - -for.cond4.preheader.preheader: ; preds = %entry + %5 = load float*, float** %4, align 8, !tbaa !17 + %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([34 x i8], [34 x i8]* @.str.24, i64 0, i64 0), i64 %2, i64 %3) + %cmp89 = icmp eq i64 %2, 0 + br i1 %cmp89, label %for.cond.cleanup, label %for.cond4.preheader.lr.ph + +for.cond4.preheader.lr.ph: ; preds = %entry + %cmp685 = icmp ugt i64 %3, 1 + br i1 %cmp685, label %for.cond4.preheader.us.preheader, label %for.cond4.preheader.preheader + +for.cond4.preheader.us.preheader: ; preds = %for.cond4.preheader.lr.ph + %6 = and i64 %3, 1 + %lcmp.mod = icmp eq i64 %6, 0 + %7 = icmp eq i64 %3, 2 + br label %for.cond4.preheader.us + +for.cond4.preheader.preheader: ; preds = %for.cond4.preheader.lr.ph + %min.iters.check = icmp ult i64 %2, 8 + br i1 %min.iters.check, label %for.cond4.preheader.preheader111, label %min.iters.checked + +for.cond4.preheader.preheader111: ; preds = %middle.block, %vector.scevcheck, %min.iters.checked, %for.cond4.preheader.preheader + %conv92.ph = phi i64 [ 0, %vector.scevcheck ], [ 0, %min.iters.checked ], [ 0, %for.cond4.preheader.preheader ], [ %n.vec, %middle.block ] + %num_errors.091.ph = phi i32 [ 0, %vector.scevcheck ], [ 0, %min.iters.checked ], [ 0, %for.cond4.preheader.preheader ], [ %45, %middle.block ] + %i.090.ph = phi i32 [ 0, %vector.scevcheck ], [ 0, %min.iters.checked ], [ 0, %for.cond4.preheader.preheader ], [ %cast.crd, %middle.block ] br label %for.cond4.preheader -for.cond4.preheader: ; preds = %for.cond4.preheader.preheader, %for.cond4.preheader - %indvars.iv = phi i64 [ %indvars.iv.next, %for.cond4.preheader ], [ 0, %for.cond4.preheader.preheader ] - %num_errors.084 = phi i32 [ %num_errors.0.inc21, %for.cond4.preheader ], [ 0, %for.cond4.preheader.preheader ] - %mul = mul i64 %indvars.iv, %3 - %arrayidx10 = getelementptr inbounds float, float* %5, i64 %mul - %6 = load float, float* %arrayidx10, align 4, !tbaa !17 - %add14 = add i64 %mul, 1 - %arrayidx15 = getelementptr inbounds float, float* %5, i64 %add14 - %7 = load float, float* %arrayidx15, align 4, !tbaa !17 - %cmp16 = fcmp fast olt float %6, %7 - %chosen.1 = zext i1 %cmp16 to i32 - %conv9.1 = zext i1 %cmp16 to i64 - %add.1 = add i64 %conv9.1, %mul - %arrayidx10.1 = getelementptr inbounds float, float* %5, i64 %add.1 - %8 = load float, float* %arrayidx10.1, align 4, !tbaa !17 - %add14.1 = add i64 %mul, 2 - %arrayidx15.1 = getelementptr inbounds float, float* %5, i64 %add14.1 - %9 = load float, float* %arrayidx15.1, align 4, !tbaa !17 - %cmp16.1 = fcmp fast olt float %8, %9 - %chosen.1.1 = select i1 %cmp16.1, i32 2, i32 %chosen.1 - %conv9.286 = zext i32 %chosen.1.1 to i64 - %add.2 = add i64 %conv9.286, %mul - %arrayidx10.2 = getelementptr inbounds float, float* %5, i64 %add.2 - %10 = load float, float* %arrayidx10.2, align 4, !tbaa !17 - %add14.2 = add i64 %mul, 3 - %arrayidx15.2 = getelementptr inbounds float, float* %5, i64 %add14.2 - %11 = load float, float* %arrayidx15.2, align 4, !tbaa !17 - %cmp16.2 = fcmp fast olt float %10, %11 - %chosen.1.2 = select i1 %cmp16.2, i32 3, i32 %chosen.1.1 - %conv9.387 = zext i32 %chosen.1.2 to i64 - %add.3 = add i64 %conv9.387, %mul - %arrayidx10.3 = getelementptr inbounds float, float* %5, i64 %add.3 - %12 = load float, float* %arrayidx10.3, align 4, !tbaa !17 - %add14.3 = add i64 %mul, 4 - %arrayidx15.3 = getelementptr inbounds float, float* %5, i64 %add14.3 - %13 = load float, float* %arrayidx15.3, align 4, !tbaa !17 - %cmp16.3 = fcmp fast olt float %12, %13 - %chosen.1.3 = select i1 %cmp16.3, i32 4, i32 %chosen.1.2 - %conv9.488 = zext i32 %chosen.1.3 to i64 - %add.4 = add i64 %conv9.488, %mul - %arrayidx10.4 = getelementptr inbounds float, float* %5, i64 %add.4 - %14 = load float, float* %arrayidx10.4, align 4, !tbaa !17 - %add14.4 = add i64 %mul, 5 - %arrayidx15.4 = getelementptr inbounds float, float* %5, i64 %add14.4 - %15 = load float, float* %arrayidx15.4, align 4, !tbaa !17 - %cmp16.4 = fcmp fast olt float %14, %15 - %chosen.1.4 = select i1 %cmp16.4, i32 5, i32 %chosen.1.3 - %conv9.589 = zext i32 %chosen.1.4 to i64 - %add.5 = add i64 %conv9.589, %mul - %arrayidx10.5 = getelementptr inbounds float, float* %5, i64 %add.5 - %16 = load float, float* %arrayidx10.5, align 4, !tbaa !17 - %add14.5 = add i64 %mul, 6 - %arrayidx15.5 = getelementptr inbounds float, float* %5, i64 %add14.5 - %17 = load float, float* %arrayidx15.5, align 4, !tbaa !17 - %cmp16.5 = fcmp fast olt float %16, %17 - %chosen.1.5 = select i1 %cmp16.5, i32 6, i32 %chosen.1.4 - %18 = zext i32 %chosen.1.5 to i64 - %add.6 = add i64 %18, %mul - %arrayidx10.6 = getelementptr inbounds float, float* %5, i64 %add.6 - %19 = load float, float* %arrayidx10.6, align 4, !tbaa !17 - %add14.6 = add i64 %mul, 7 - %arrayidx15.6 = getelementptr inbounds float, float* %5, i64 %add14.6 - %20 = load float, float* %arrayidx15.6, align 4, !tbaa !17 - %cmp16.6 = fcmp fast olt float %19, %20 - %chosen.1.6 = select i1 %cmp16.6, i32 7, i32 %chosen.1.5 - %conv9.7 = sext i32 %chosen.1.6 to i64 - %add.7 = add i64 %conv9.7, %mul - %arrayidx10.7 = getelementptr inbounds float, float* %5, i64 %add.7 - %21 = load float, float* %arrayidx10.7, align 4, !tbaa !17 - %add14.7 = add i64 %mul, 8 - %arrayidx15.7 = getelementptr inbounds float, float* %5, i64 %add14.7 - %22 = load float, float* %arrayidx15.7, align 4, !tbaa !17 - %cmp16.7 = fcmp fast olt float %21, %22 - %chosen.1.7 = select i1 %cmp16.7, i32 8, i32 %chosen.1.6 - %conv9.8 = sext i32 %chosen.1.7 to i64 - %add.8 = add i64 %conv9.8, %mul - %arrayidx10.8 = getelementptr inbounds float, float* %5, i64 %add.8 - %23 = load float, float* %arrayidx10.8, align 4, !tbaa !17 - %add14.8 = add i64 %mul, 9 - %arrayidx15.8 = getelementptr inbounds float, float* %5, i64 %add14.8 - %24 = load float, float* %arrayidx15.8, align 4, !tbaa !17 - %cmp16.8 = fcmp fast olt float %23, %24 - %chosen.1.8 = select i1 %cmp16.8, i32 9, i32 %chosen.1.7 - %arrayidx17 = getelementptr inbounds i8, i8* %labels, i64 %indvars.iv - %25 = load i8, i8* %arrayidx17, align 1, !tbaa !36 - %conv18 = zext i8 %25 to i32 - %not.cmp19 = icmp ne i32 %chosen.1.8, %conv18 - %inc21 = zext i1 %not.cmp19 to i32 - %num_errors.0.inc21 = add nsw i32 %inc21, %num_errors.084 +min.iters.checked: ; preds = %for.cond4.preheader.preheader + %n.vec = and i64 %2, -8 + %cmp.zero = icmp eq i64 %n.vec, 0 + br i1 %cmp.zero, label %for.cond4.preheader.preheader111, label %vector.scevcheck + +vector.scevcheck: ; preds = %min.iters.checked + %8 = add i64 %2, -1 + %9 = trunc i64 %8 to i32 + %10 = icmp eq i32 %9, -1 + %11 = icmp ugt i64 %8, 4294967295 + %12 = or i1 %10, %11 + %cast.crd = trunc i64 %n.vec to i32 + br i1 %12, label %for.cond4.preheader.preheader111, label %vector.body.preheader + +vector.body.preheader: ; preds = %vector.scevcheck + %13 = add i64 %n.vec, -8 + %14 = lshr exact i64 %13, 3 + %15 = and i64 %14, 1 + %lcmp.mod115 = icmp eq i64 %15, 0 + br i1 %lcmp.mod115, label %vector.body.prol.preheader, label %vector.body.prol.loopexit + +vector.body.prol.preheader: ; preds = %vector.body.preheader + br label %vector.body.prol + +vector.body.prol: ; preds = %vector.body.prol.preheader + %16 = bitcast i8* %labels to <4 x i8>* + %wide.load.prol = load <4 x i8>, <4 x i8>* %16, align 1, !tbaa !42 + %17 = getelementptr i8, i8* %labels, i64 4 + %18 = bitcast i8* %17 to <4 x i8>* + %wide.load107.prol = load <4 x i8>, <4 x i8>* %18, align 1, !tbaa !42 + %19 = icmp ne <4 x i8> %wide.load.prol, zeroinitializer + %20 = icmp ne <4 x i8> %wide.load107.prol, zeroinitializer + %21 = zext <4 x i1> %19 to <4 x i32> + %22 = zext <4 x i1> %20 to <4 x i32> + br label %vector.body.prol.loopexit + +vector.body.prol.loopexit: ; preds = %vector.body.prol, %vector.body.preheader + %.lcssa113.unr = phi <4 x i32> [ undef, %vector.body.preheader ], [ %21, %vector.body.prol ] + %.lcssa.unr = phi <4 x i32> [ undef, %vector.body.preheader ], [ %22, %vector.body.prol ] + %index.unr = phi i64 [ 0, %vector.body.preheader ], [ 8, %vector.body.prol ] + %vec.phi.unr = phi <4 x i32> [ zeroinitializer, %vector.body.preheader ], [ %21, %vector.body.prol ] + %vec.phi102.unr = phi <4 x i32> [ zeroinitializer, %vector.body.preheader ], [ %22, %vector.body.prol ] + %23 = icmp eq i64 %14, 0 + br i1 %23, label %middle.block, label %vector.body.preheader.new + +vector.body.preheader.new: ; preds = %vector.body.prol.loopexit + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.body.preheader.new + %index = phi i64 [ %index.unr, %vector.body.preheader.new ], [ %index.next.1, %vector.body ] + %vec.phi = phi <4 x i32> [ %vec.phi.unr, %vector.body.preheader.new ], [ %42, %vector.body ] + %vec.phi102 = phi <4 x i32> [ %vec.phi102.unr, %vector.body.preheader.new ], [ %43, %vector.body ] + %24 = getelementptr inbounds i8, i8* %labels, i64 %index + %25 = bitcast i8* %24 to <4 x i8>* + %wide.load = load <4 x i8>, <4 x i8>* %25, align 1, !tbaa !42 + %26 = getelementptr i8, i8* %24, i64 4 + %27 = bitcast i8* %26 to <4 x i8>* + %wide.load107 = load <4 x i8>, <4 x i8>* %27, align 1, !tbaa !42 + %28 = icmp ne <4 x i8> %wide.load, zeroinitializer + %29 = icmp ne <4 x i8> %wide.load107, zeroinitializer + %30 = zext <4 x i1> %28 to <4 x i32> + %31 = zext <4 x i1> %29 to <4 x i32> + %32 = add nsw <4 x i32> %30, %vec.phi + %33 = add nsw <4 x i32> %31, %vec.phi102 + %index.next = add i64 %index, 8 + %34 = getelementptr inbounds i8, i8* %labels, i64 %index.next + %35 = bitcast i8* %34 to <4 x i8>* + %wide.load.1 = load <4 x i8>, <4 x i8>* %35, align 1, !tbaa !42 + %36 = getelementptr i8, i8* %34, i64 4 + %37 = bitcast i8* %36 to <4 x i8>* + %wide.load107.1 = load <4 x i8>, <4 x i8>* %37, align 1, !tbaa !42 + %38 = icmp ne <4 x i8> %wide.load.1, zeroinitializer + %39 = icmp ne <4 x i8> %wide.load107.1, zeroinitializer + %40 = zext <4 x i1> %38 to <4 x i32> + %41 = zext <4 x i1> %39 to <4 x i32> + %42 = add nsw <4 x i32> %40, %32 + %43 = add nsw <4 x i32> %41, %33 + %index.next.1 = add i64 %index, 16 + %44 = icmp eq i64 %index.next.1, %n.vec + br i1 %44, label %middle.block.unr-lcssa, label %vector.body, !llvm.loop !63 + +middle.block.unr-lcssa: ; preds = %vector.body + br label %middle.block + +middle.block: ; preds = %vector.body.prol.loopexit, %middle.block.unr-lcssa + %.lcssa113 = phi <4 x i32> [ %.lcssa113.unr, %vector.body.prol.loopexit ], [ %42, %middle.block.unr-lcssa ] + %.lcssa = phi <4 x i32> [ %.lcssa.unr, %vector.body.prol.loopexit ], [ %43, %middle.block.unr-lcssa ] + %bin.rdx = add <4 x i32> %.lcssa, %.lcssa113 + %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> + %bin.rdx108 = add <4 x i32> %bin.rdx, %rdx.shuf + %rdx.shuf109 = shufflevector <4 x i32> %bin.rdx108, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> + %bin.rdx110 = add <4 x i32> %bin.rdx108, %rdx.shuf109 + %45 = extractelement <4 x i32> %bin.rdx110, i32 0 + %cmp.n = icmp eq i64 %2, %n.vec + br i1 %cmp.n, label %for.cond.cleanup, label %for.cond4.preheader.preheader111 + +for.cond4.preheader.us: ; preds = %for.cond4.preheader.us.preheader, %for.cond4.for.cond.cleanup7_crit_edge.us + %conv92.us = phi i64 [ %conv.us, %for.cond4.for.cond.cleanup7_crit_edge.us ], [ 0, %for.cond4.preheader.us.preheader ] + %num_errors.091.us = phi i32 [ %num_errors.0.inc22.us, %for.cond4.for.cond.cleanup7_crit_edge.us ], [ 0, %for.cond4.preheader.us.preheader ] + %i.090.us = phi i32 [ %inc25.us, %for.cond4.for.cond.cleanup7_crit_edge.us ], [ 0, %for.cond4.preheader.us.preheader ] + %mul.us = mul i64 %conv92.us, %3 + br i1 %lcmp.mod, label %for.body8.us.prol.preheader, label %for.body8.us.prol.loopexit.unr-lcssa + +for.body8.us.prol.preheader: ; preds = %for.cond4.preheader.us + br label %for.body8.us.prol + +for.body8.us.prol: ; preds = %for.body8.us.prol.preheader + %arrayidx11.us.prol = getelementptr inbounds float, float* %5, i64 %mul.us + %46 = load float, float* %arrayidx11.us.prol, align 4, !tbaa !20 + %add15.us.prol = add i64 %mul.us, 1 + %arrayidx16.us.prol = getelementptr inbounds float, float* %5, i64 %add15.us.prol + %47 = load float, float* %arrayidx16.us.prol, align 4, !tbaa !20 + %cmp17.us.prol = fcmp fast olt float %46, %47 + %chosen.1.us.prol = zext i1 %cmp17.us.prol to i32 + br label %for.body8.us.prol.loopexit.unr-lcssa + +for.body8.us.prol.loopexit.unr-lcssa: ; preds = %for.cond4.preheader.us, %for.body8.us.prol + %chosen.1.us.lcssa.unr.ph = phi i32 [ %chosen.1.us.prol, %for.body8.us.prol ], [ undef, %for.cond4.preheader.us ] + %indvars.iv.unr.ph = phi i64 [ 2, %for.body8.us.prol ], [ 1, %for.cond4.preheader.us ] + %chosen.086.us.unr.ph = phi i32 [ %chosen.1.us.prol, %for.body8.us.prol ], [ 0, %for.cond4.preheader.us ] + br label %for.body8.us.prol.loopexit + +for.body8.us.prol.loopexit: ; preds = %for.body8.us.prol.loopexit.unr-lcssa + br i1 %7, label %for.cond4.for.cond.cleanup7_crit_edge.us, label %for.cond4.preheader.us.new + +for.cond4.preheader.us.new: ; preds = %for.body8.us.prol.loopexit + br label %for.body8.us + +for.body8.us: ; preds = %for.body8.us, %for.cond4.preheader.us.new + %indvars.iv = phi i64 [ %indvars.iv.unr.ph, %for.cond4.preheader.us.new ], [ %indvars.iv.next.1, %for.body8.us ] + %chosen.086.us = phi i32 [ %chosen.086.us.unr.ph, %for.cond4.preheader.us.new ], [ %chosen.1.us.1, %for.body8.us ] + %conv10.us = sext i32 %chosen.086.us to i64 + %add.us = add i64 %conv10.us, %mul.us + %arrayidx11.us = getelementptr inbounds float, float* %5, i64 %add.us + %48 = load float, float* %arrayidx11.us, align 4, !tbaa !20 + %add15.us = add i64 %indvars.iv, %mul.us + %arrayidx16.us = getelementptr inbounds float, float* %5, i64 %add15.us + %49 = load float, float* %arrayidx16.us, align 4, !tbaa !20 + %cmp17.us = fcmp fast olt float %48, %49 + %50 = trunc i64 %indvars.iv to i32 + %chosen.1.us = select i1 %cmp17.us, i32 %50, i32 %chosen.086.us %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp eq i64 %indvars.iv.next, %2 - br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.cond4.preheader + %conv10.us.1 = sext i32 %chosen.1.us to i64 + %add.us.1 = add i64 %conv10.us.1, %mul.us + %arrayidx11.us.1 = getelementptr inbounds float, float* %5, i64 %add.us.1 + %51 = load float, float* %arrayidx11.us.1, align 4, !tbaa !20 + %add15.us.1 = add i64 %indvars.iv.next, %mul.us + %arrayidx16.us.1 = getelementptr inbounds float, float* %5, i64 %add15.us.1 + %52 = load float, float* %arrayidx16.us.1, align 4, !tbaa !20 + %cmp17.us.1 = fcmp fast olt float %51, %52 + %53 = trunc i64 %indvars.iv.next to i32 + %chosen.1.us.1 = select i1 %cmp17.us.1, i32 %53, i32 %chosen.1.us + %indvars.iv.next.1 = add nsw i64 %indvars.iv, 2 + %exitcond.1 = icmp eq i64 %indvars.iv.next.1, %3 + br i1 %exitcond.1, label %for.cond4.for.cond.cleanup7_crit_edge.us.unr-lcssa, label %for.body8.us + +for.cond4.for.cond.cleanup7_crit_edge.us.unr-lcssa: ; preds = %for.body8.us + br label %for.cond4.for.cond.cleanup7_crit_edge.us + +for.cond4.for.cond.cleanup7_crit_edge.us: ; preds = %for.body8.us.prol.loopexit, %for.cond4.for.cond.cleanup7_crit_edge.us.unr-lcssa + %chosen.1.us.lcssa = phi i32 [ %chosen.1.us.lcssa.unr.ph, %for.body8.us.prol.loopexit ], [ %chosen.1.us.1, %for.cond4.for.cond.cleanup7_crit_edge.us.unr-lcssa ] + %arrayidx18.us = getelementptr inbounds i8, i8* %labels, i64 %conv92.us + %54 = load i8, i8* %arrayidx18.us, align 1, !tbaa !42 + %conv19.us = zext i8 %54 to i32 + %not.cmp20.us = icmp ne i32 %chosen.1.us.lcssa, %conv19.us + %inc22.us = zext i1 %not.cmp20.us to i32 + %num_errors.0.inc22.us = add nsw i32 %inc22.us, %num_errors.091.us + %inc25.us = add i32 %i.090.us, 1 + %conv.us = zext i32 %inc25.us to i64 + %cmp.us = icmp ult i64 %conv.us, %2 + br i1 %cmp.us, label %for.cond4.preheader.us, label %for.cond.cleanup.loopexit + +for.cond4.preheader: ; preds = %for.cond4.preheader.preheader111, %for.cond4.preheader + %conv92 = phi i64 [ %conv, %for.cond4.preheader ], [ %conv92.ph, %for.cond4.preheader.preheader111 ] + %num_errors.091 = phi i32 [ %num_errors.0.inc22, %for.cond4.preheader ], [ %num_errors.091.ph, %for.cond4.preheader.preheader111 ] + %i.090 = phi i32 [ %inc25, %for.cond4.preheader ], [ %i.090.ph, %for.cond4.preheader.preheader111 ] + %arrayidx18 = getelementptr inbounds i8, i8* %labels, i64 %conv92 + %55 = load i8, i8* %arrayidx18, align 1, !tbaa !42 + %not.cmp20 = icmp ne i8 %55, 0 + %inc22 = zext i1 %not.cmp20 to i32 + %num_errors.0.inc22 = add nsw i32 %inc22, %num_errors.091 + %inc25 = add i32 %i.090, 1 + %conv = zext i32 %inc25 to i64 + %cmp = icmp ult i64 %conv, %2 + br i1 %cmp, label %for.cond4.preheader, label %for.cond.cleanup.loopexit112, !llvm.loop !64 + +for.cond.cleanup.loopexit: ; preds = %for.cond4.for.cond.cleanup7_crit_edge.us + br label %for.cond.cleanup -for.cond.cleanup.loopexit: ; preds = %for.cond4.preheader - %phitmp = sext i32 %num_errors.0.inc21 to i64 +for.cond.cleanup.loopexit112: ; preds = %for.cond4.preheader br label %for.cond.cleanup -for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry - %num_errors.0.lcssa = phi i64 [ 0, %entry ], [ %phitmp, %for.cond.cleanup.loopexit ] - %sub = sub i64 %2, %num_errors.0.lcssa +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit112, %for.cond.cleanup.loopexit, %middle.block, %entry + %num_errors.0.lcssa = phi i32 [ 0, %entry ], [ %45, %middle.block ], [ %num_errors.0.inc22.us, %for.cond.cleanup.loopexit ], [ %num_errors.0.inc22, %for.cond.cleanup.loopexit112 ] + %conv27 = sext i32 %num_errors.0.lcssa to i64 + %sub = sub i64 %2, %conv27 + %conv28 = uitofp i64 %sub to double + %conv30 = uitofp i64 %2 to double + %div = fdiv fast double %conv28, %conv30 + %mul32 = fmul fast double %div, 1.000000e+02 + %conv33 = fptrunc double %mul32 to float + %conv34 = fpext float %conv33 to double + %call35 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.21, i64 0, i64 0), double %conv34) + %call36 = tail call %struct._IO_FILE* @fopen(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.22, i64 0, i64 0), i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.23, i64 0, i64 0)) + %cmp37 = icmp eq %struct._IO_FILE* %call36, null + br i1 %cmp37, label %if.end44, label %if.then38 + +if.then38: ; preds = %for.cond.cleanup + %56 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i8* + call void @llvm.lifetime.start(i64 376, i8* nonnull %56) #2 + %57 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2 + %58 = getelementptr inbounds %"class.std::basic_ios", %"class.std::basic_ios"* %57, i64 0, i32 0 + call void @_ZNSt8ios_baseC2Ev(%"class.std::ios_base"* %58) #2 + %59 = getelementptr inbounds %"class.std::basic_ios", %"class.std::basic_ios"* %57, i64 0, i32 0, i32 0 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [4 x i8*] }, { [4 x i8*] }* @_ZTVSt9basic_iosIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %59, align 16, !tbaa !46 + %_M_tie.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 1 + store %"class.std::basic_ostream"* null, %"class.std::basic_ostream"** %_M_tie.i.i, align 8, !tbaa !48 + %_M_fill.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 2 + store i8 0, i8* %_M_fill.i.i, align 16, !tbaa !51 + %_M_fill_init.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 3 + store i8 0, i8* %_M_fill_init.i.i, align 1, !tbaa !52 + %_M_streambuf.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 4 + %60 = bitcast %"class.std::basic_streambuf"** %_M_streambuf.i.i to i8* + call void @llvm.memset.p0i8.i64(i8* %60, i8 0, i64 32, i32 8, i1 false) #2 + %61 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 1) to i64*), align 8 + %62 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i64* + store i64 %61, i64* %62, align 16, !tbaa !46 + %63 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 2) to i64*), align 8 + %64 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i8** + %vtable.cast.i.i = inttoptr i64 %61 to i8* + %vbase.offset.ptr.i.i = getelementptr i8, i8* %vtable.cast.i.i, i64 -24 + %65 = bitcast i8* %vbase.offset.ptr.i.i to i64* + %vbase.offset.i.i = load i64, i64* %65, align 8 + %add.ptr.i.i = getelementptr inbounds i8, i8* %56, i64 %vbase.offset.i.i + %66 = bitcast i8* %add.ptr.i.i to i64* + store i64 %63, i64* %66, align 8, !tbaa !46 + %vtable3.i.i = load i8*, i8** %64, align 16, !tbaa !46 + %vbase.offset.ptr4.i.i = getelementptr i8, i8* %vtable3.i.i, i64 -24 + %67 = bitcast i8* %vbase.offset.ptr4.i.i to i64* + %vbase.offset5.i.i = load i64, i64* %67, align 8 + %add.ptr6.i.i = getelementptr inbounds i8, i8* %56, i64 %vbase.offset5.i.i + %68 = bitcast i8* %add.ptr6.i.i to %"class.std::basic_ios"* + call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %68, %"class.std::basic_streambuf"* null) #2 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 1, i64 3) to i32 (...)**), i32 (...)*** %59, align 16, !tbaa !46 + %_M_stringbuf.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1 + %69 = getelementptr inbounds %"class.std::__cxx11::basic_stringbuf", %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i, i64 0, i32 0, i32 0 + %70 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to <2 x i32 (...)**>* + store <2 x i32 (...)**> <i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 3) to i32 (...)**), i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**)>, <2 x i32 (...)**>* %70, align 16, !tbaa !46 + %_M_in_beg.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 1 + %_M_buf_locale.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 7 + %71 = bitcast i8** %_M_in_beg.i.i.i to i8* + call void @llvm.memset.p0i8.i64(i8* %71, i8 0, i64 48, i32 8, i1 false) #2 + call void @_ZNSt6localeC1Ev(%"class.std::locale"* %_M_buf_locale.i.i.i) #2 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %69, align 8, !tbaa !46 + %_M_mode.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 1 + store i32 16, i32* %_M_mode.i.i, align 8, !tbaa !53 + %_M_string.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2 + %72 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 2 + %73 = bitcast %"class.std::__cxx11::basic_string"* %_M_string.i.i to %union.anon** + store %union.anon* %72, %union.anon** %73, align 8, !tbaa !58 + %_M_string_length.i.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 1 + store i64 0, i64* %_M_string_length.i.i.i.i.i, align 8, !tbaa !59 + %.cast.i.i.i = bitcast %union.anon* %72 to i8* + store i8 0, i8* %.cast.i.i.i, align 8, !tbaa !42 + %vtable.i = load i8*, i8** %64, align 16, !tbaa !46 + %vbase.offset.ptr.i = getelementptr i8, i8* %vtable.i, i64 -24 + %74 = bitcast i8* %vbase.offset.ptr.i to i64* + %vbase.offset.i = load i64, i64* %74, align 8 + %add.ptr2.i = getelementptr inbounds i8, i8* %56, i64 %vbase.offset.i + %75 = bitcast i8* %add.ptr2.i to %"class.std::basic_ios"* + %76 = getelementptr inbounds %"class.std::__cxx11::basic_stringbuf", %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i, i64 0, i32 0 + call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %75, %"class.std::basic_streambuf"* %76) #2 + %77 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to %"class.std::basic_ostream"* + %vtable.i77 = load i8*, i8** %64, align 16, !tbaa !46 + %vbase.offset.ptr.i78 = getelementptr i8, i8* %vtable.i77, i64 -24 + %78 = bitcast i8* %vbase.offset.ptr.i78 to i64* + %vbase.offset.i79 = load i64, i64* %78, align 8 + %add.ptr.i = getelementptr inbounds i8, i8* %56, i64 %vbase.offset.i79 + %_M_flags.i.i = getelementptr inbounds i8, i8* %add.ptr.i, i64 24 + %79 = bitcast i8* %_M_flags.i.i to i32* + %80 = load i32, i32* %79, align 8, !tbaa !65 + %and.i.i.i.i = and i32 %80, -261 + %or.i.i.i.i = or i32 %and.i.i.i.i, 4 + store i32 %or.i.i.i.i, i32* %79, align 4, !tbaa !60 + %call.i = call dereferenceable(272) %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"* nonnull %77, double %conv34) #2 + %81 = bitcast %"class.std::__cxx11::basic_string"* %print_str to i8* + call void @llvm.lifetime.start(i64 32, i8* nonnull %81) #2 + call void @_ZNKSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEE3strEv(%"class.std::__cxx11::basic_string"* nonnull sret %print_str, %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i) #2 + %_M_p.i.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 0, i32 0 + %82 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !62 + %_M_string_length.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 1 + %83 = load i64, i64* %_M_string_length.i, align 8, !tbaa !59 + %call43 = call i64 @fwrite(i8* %82, i64 1, i64 %83, %struct._IO_FILE* nonnull %call36) + %84 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !62 + %85 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 2 + %arraydecay.i.i.i.i = bitcast %union.anon* %85 to i8* + %cmp.i.i.i = icmp eq i8* %84, %arraydecay.i.i.i.i + br i1 %cmp.i.i.i, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit, label %if.then.i.i + +if.then.i.i: ; preds = %if.then38 + call void @_ZdlPv(i8* %84) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit: ; preds = %if.then38, %if.then.i.i + call void @llvm.lifetime.end(i64 32, i8* nonnull %81) #2 + %86 = load i64, i64* bitcast ([4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE to i64*), align 8 + store i64 %86, i64* %62, align 16, !tbaa !46 + %87 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 3) to i64*), align 8 + %vtable.cast.i.i81 = inttoptr i64 %86 to i8* + %vbase.offset.ptr.i.i82 = getelementptr i8, i8* %vtable.cast.i.i81, i64 -24 + %88 = bitcast i8* %vbase.offset.ptr.i.i82 to i64* + %vbase.offset.i.i83 = load i64, i64* %88, align 8 + %add.ptr.i.i84 = getelementptr inbounds i8, i8* %56, i64 %vbase.offset.i.i83 + %89 = bitcast i8* %add.ptr.i.i84 to i64* + store i64 %87, i64* %89, align 8, !tbaa !46 + %90 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 0 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %90, align 8, !tbaa !46 + %_M_p.i.i.i.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 0, i32 0 + %91 = load i8*, i8** %_M_p.i.i.i.i.i.i.i, align 8, !tbaa !62 + %cmp.i.i.i.i.i.i = icmp eq i8* %91, %.cast.i.i.i + br i1 %cmp.i.i.i.i.i.i, label %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit, label %if.then.i.i.i.i.i + +if.then.i.i.i.i.i: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit + call void @_ZdlPv(i8* %91) #2 + br label %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit + +_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit, %if.then.i.i.i.i.i + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %90, align 8, !tbaa !46 + call void @_ZNSt6localeD1Ev(%"class.std::locale"* nonnull %_M_buf_locale.i.i.i) #2 + %92 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 0 + call void @_ZNSt8ios_baseD2Ev(%"class.std::ios_base"* %92) #2 + call void @llvm.lifetime.end(i64 376, i8* nonnull %56) #2 + br label %if.end44 + +if.end44: ; preds = %for.cond.cleanup, %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit + %call45 = call i32 @fclose(%struct._IO_FILE* %call36) + ret float %conv33 +} + +; Function Attrs: nounwind uwtable +define float @_Z16computeAccuracy3PjPv(i32* nocapture readonly %labels, i8* nocapture readonly %result_ptr) local_unnamed_addr #3 { +entry: + %ss = alloca %"class.std::__cxx11::basic_ostringstream", align 16 + %print_str = alloca %"class.std::__cxx11::basic_string", align 8 + %dim_sizes = getelementptr inbounds i8, i8* %result_ptr, i64 96 + %0 = bitcast i8* %dim_sizes to i64** + %1 = load i64*, i64** %0, align 8, !tbaa !14 + %2 = load i64, i64* %1, align 8, !tbaa !15 + %arrayidx3 = getelementptr inbounds i64, i64* %1, i64 1 + %3 = load i64, i64* %arrayidx3, align 8, !tbaa !15 + %host_data = getelementptr inbounds i8, i8* %result_ptr, i64 48 + %4 = bitcast i8* %host_data to float** + %5 = load float*, float** %4, align 8, !tbaa !17 + %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([37 x i8], [37 x i8]* @.str.25, i64 0, i64 0), i64 %2, i64 %3) + %cmp89 = icmp eq i64 %2, 0 + br i1 %cmp89, label %for.cond.cleanup, label %for.cond4.preheader.lr.ph + +for.cond4.preheader.lr.ph: ; preds = %entry + %cmp685 = icmp ugt i64 %3, 1 + br i1 %cmp685, label %for.cond4.preheader.us.preheader, label %for.cond4.preheader.preheader + +for.cond4.preheader.us.preheader: ; preds = %for.cond4.preheader.lr.ph + %6 = and i64 %3, 1 + %lcmp.mod = icmp eq i64 %6, 0 + %7 = icmp eq i64 %3, 2 + br label %for.cond4.preheader.us + +for.cond4.preheader.preheader: ; preds = %for.cond4.preheader.lr.ph + %min.iters.check = icmp ult i64 %2, 8 + br i1 %min.iters.check, label %for.cond4.preheader.preheader109, label %min.iters.checked + +for.cond4.preheader.preheader109: ; preds = %middle.block, %min.iters.checked, %for.cond4.preheader.preheader + %indvars.iv98.ph = phi i64 [ 0, %min.iters.checked ], [ 0, %for.cond4.preheader.preheader ], [ %n.vec, %middle.block ] + %num_errors.091.ph = phi i32 [ 0, %min.iters.checked ], [ 0, %for.cond4.preheader.preheader ], [ %40, %middle.block ] + br label %for.cond4.preheader + +min.iters.checked: ; preds = %for.cond4.preheader.preheader + %n.vec = and i64 %2, -8 + %cmp.zero = icmp eq i64 %n.vec, 0 + br i1 %cmp.zero, label %for.cond4.preheader.preheader109, label %vector.body.preheader + +vector.body.preheader: ; preds = %min.iters.checked + %8 = add i64 %n.vec, -8 + %9 = lshr exact i64 %8, 3 + %10 = and i64 %9, 1 + %lcmp.mod113 = icmp eq i64 %10, 0 + br i1 %lcmp.mod113, label %vector.body.prol.preheader, label %vector.body.prol.loopexit + +vector.body.prol.preheader: ; preds = %vector.body.preheader + br label %vector.body.prol + +vector.body.prol: ; preds = %vector.body.prol.preheader + %11 = bitcast i32* %labels to <4 x i32>* + %wide.load.prol = load <4 x i32>, <4 x i32>* %11, align 4, !tbaa !70 + %12 = getelementptr i32, i32* %labels, i64 4 + %13 = bitcast i32* %12 to <4 x i32>* + %wide.load105.prol = load <4 x i32>, <4 x i32>* %13, align 4, !tbaa !70 + %14 = icmp ne <4 x i32> %wide.load.prol, zeroinitializer + %15 = icmp ne <4 x i32> %wide.load105.prol, zeroinitializer + %16 = zext <4 x i1> %14 to <4 x i32> + %17 = zext <4 x i1> %15 to <4 x i32> + br label %vector.body.prol.loopexit + +vector.body.prol.loopexit: ; preds = %vector.body.prol, %vector.body.preheader + %.lcssa111.unr = phi <4 x i32> [ undef, %vector.body.preheader ], [ %16, %vector.body.prol ] + %.lcssa.unr = phi <4 x i32> [ undef, %vector.body.preheader ], [ %17, %vector.body.prol ] + %index.unr = phi i64 [ 0, %vector.body.preheader ], [ 8, %vector.body.prol ] + %vec.phi.unr = phi <4 x i32> [ zeroinitializer, %vector.body.preheader ], [ %16, %vector.body.prol ] + %vec.phi104.unr = phi <4 x i32> [ zeroinitializer, %vector.body.preheader ], [ %17, %vector.body.prol ] + %18 = icmp eq i64 %9, 0 + br i1 %18, label %middle.block, label %vector.body.preheader.new + +vector.body.preheader.new: ; preds = %vector.body.prol.loopexit + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.body.preheader.new + %index = phi i64 [ %index.unr, %vector.body.preheader.new ], [ %index.next.1, %vector.body ] + %vec.phi = phi <4 x i32> [ %vec.phi.unr, %vector.body.preheader.new ], [ %37, %vector.body ] + %vec.phi104 = phi <4 x i32> [ %vec.phi104.unr, %vector.body.preheader.new ], [ %38, %vector.body ] + %19 = getelementptr inbounds i32, i32* %labels, i64 %index + %20 = bitcast i32* %19 to <4 x i32>* + %wide.load = load <4 x i32>, <4 x i32>* %20, align 4, !tbaa !70 + %21 = getelementptr i32, i32* %19, i64 4 + %22 = bitcast i32* %21 to <4 x i32>* + %wide.load105 = load <4 x i32>, <4 x i32>* %22, align 4, !tbaa !70 + %23 = icmp ne <4 x i32> %wide.load, zeroinitializer + %24 = icmp ne <4 x i32> %wide.load105, zeroinitializer + %25 = zext <4 x i1> %23 to <4 x i32> + %26 = zext <4 x i1> %24 to <4 x i32> + %27 = add nsw <4 x i32> %25, %vec.phi + %28 = add nsw <4 x i32> %26, %vec.phi104 + %index.next = add i64 %index, 8 + %29 = getelementptr inbounds i32, i32* %labels, i64 %index.next + %30 = bitcast i32* %29 to <4 x i32>* + %wide.load.1 = load <4 x i32>, <4 x i32>* %30, align 4, !tbaa !70 + %31 = getelementptr i32, i32* %29, i64 4 + %32 = bitcast i32* %31 to <4 x i32>* + %wide.load105.1 = load <4 x i32>, <4 x i32>* %32, align 4, !tbaa !70 + %33 = icmp ne <4 x i32> %wide.load.1, zeroinitializer + %34 = icmp ne <4 x i32> %wide.load105.1, zeroinitializer + %35 = zext <4 x i1> %33 to <4 x i32> + %36 = zext <4 x i1> %34 to <4 x i32> + %37 = add nsw <4 x i32> %35, %27 + %38 = add nsw <4 x i32> %36, %28 + %index.next.1 = add i64 %index, 16 + %39 = icmp eq i64 %index.next.1, %n.vec + br i1 %39, label %middle.block.unr-lcssa, label %vector.body, !llvm.loop !71 + +middle.block.unr-lcssa: ; preds = %vector.body + br label %middle.block + +middle.block: ; preds = %vector.body.prol.loopexit, %middle.block.unr-lcssa + %.lcssa111 = phi <4 x i32> [ %.lcssa111.unr, %vector.body.prol.loopexit ], [ %37, %middle.block.unr-lcssa ] + %.lcssa = phi <4 x i32> [ %.lcssa.unr, %vector.body.prol.loopexit ], [ %38, %middle.block.unr-lcssa ] + %bin.rdx = add <4 x i32> %.lcssa, %.lcssa111 + %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> + %bin.rdx106 = add <4 x i32> %bin.rdx, %rdx.shuf + %rdx.shuf107 = shufflevector <4 x i32> %bin.rdx106, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> + %bin.rdx108 = add <4 x i32> %bin.rdx106, %rdx.shuf107 + %40 = extractelement <4 x i32> %bin.rdx108, i32 0 + %cmp.n = icmp eq i64 %2, %n.vec + br i1 %cmp.n, label %for.cond.cleanup, label %for.cond4.preheader.preheader109 + +for.cond4.preheader.us: ; preds = %for.cond4.preheader.us.preheader, %for.cond4.for.cond.cleanup7_crit_edge.us + %indvars.iv95 = phi i64 [ %indvars.iv.next96, %for.cond4.for.cond.cleanup7_crit_edge.us ], [ 0, %for.cond4.preheader.us.preheader ] + %num_errors.091.us = phi i32 [ %num_errors.0.inc21.us, %for.cond4.for.cond.cleanup7_crit_edge.us ], [ 0, %for.cond4.preheader.us.preheader ] + %mul.us = mul i64 %indvars.iv95, %3 + br i1 %lcmp.mod, label %for.body8.us.prol.preheader, label %for.body8.us.prol.loopexit.unr-lcssa + +for.body8.us.prol.preheader: ; preds = %for.cond4.preheader.us + br label %for.body8.us.prol + +for.body8.us.prol: ; preds = %for.body8.us.prol.preheader + %arrayidx11.us.prol = getelementptr inbounds float, float* %5, i64 %mul.us + %41 = load float, float* %arrayidx11.us.prol, align 4, !tbaa !20 + %add15.us.prol = add i64 %mul.us, 1 + %arrayidx16.us.prol = getelementptr inbounds float, float* %5, i64 %add15.us.prol + %42 = load float, float* %arrayidx16.us.prol, align 4, !tbaa !20 + %cmp17.us.prol = fcmp fast olt float %41, %42 + %chosen.1.us.prol = zext i1 %cmp17.us.prol to i32 + br label %for.body8.us.prol.loopexit.unr-lcssa + +for.body8.us.prol.loopexit.unr-lcssa: ; preds = %for.cond4.preheader.us, %for.body8.us.prol + %chosen.1.us.lcssa.unr.ph = phi i32 [ %chosen.1.us.prol, %for.body8.us.prol ], [ undef, %for.cond4.preheader.us ] + %indvars.iv.unr.ph = phi i64 [ 2, %for.body8.us.prol ], [ 1, %for.cond4.preheader.us ] + %chosen.086.us.unr.ph = phi i32 [ %chosen.1.us.prol, %for.body8.us.prol ], [ 0, %for.cond4.preheader.us ] + br label %for.body8.us.prol.loopexit + +for.body8.us.prol.loopexit: ; preds = %for.body8.us.prol.loopexit.unr-lcssa + br i1 %7, label %for.cond4.for.cond.cleanup7_crit_edge.us, label %for.cond4.preheader.us.new + +for.cond4.preheader.us.new: ; preds = %for.body8.us.prol.loopexit + br label %for.body8.us + +for.body8.us: ; preds = %for.body8.us, %for.cond4.preheader.us.new + %indvars.iv = phi i64 [ %indvars.iv.unr.ph, %for.cond4.preheader.us.new ], [ %indvars.iv.next.1, %for.body8.us ] + %chosen.086.us = phi i32 [ %chosen.086.us.unr.ph, %for.cond4.preheader.us.new ], [ %chosen.1.us.1, %for.body8.us ] + %conv10.us = sext i32 %chosen.086.us to i64 + %add.us = add i64 %conv10.us, %mul.us + %arrayidx11.us = getelementptr inbounds float, float* %5, i64 %add.us + %43 = load float, float* %arrayidx11.us, align 4, !tbaa !20 + %add15.us = add i64 %indvars.iv, %mul.us + %arrayidx16.us = getelementptr inbounds float, float* %5, i64 %add15.us + %44 = load float, float* %arrayidx16.us, align 4, !tbaa !20 + %cmp17.us = fcmp fast olt float %43, %44 + %45 = trunc i64 %indvars.iv to i32 + %chosen.1.us = select i1 %cmp17.us, i32 %45, i32 %chosen.086.us + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %conv10.us.1 = sext i32 %chosen.1.us to i64 + %add.us.1 = add i64 %conv10.us.1, %mul.us + %arrayidx11.us.1 = getelementptr inbounds float, float* %5, i64 %add.us.1 + %46 = load float, float* %arrayidx11.us.1, align 4, !tbaa !20 + %add15.us.1 = add i64 %indvars.iv.next, %mul.us + %arrayidx16.us.1 = getelementptr inbounds float, float* %5, i64 %add15.us.1 + %47 = load float, float* %arrayidx16.us.1, align 4, !tbaa !20 + %cmp17.us.1 = fcmp fast olt float %46, %47 + %48 = trunc i64 %indvars.iv.next to i32 + %chosen.1.us.1 = select i1 %cmp17.us.1, i32 %48, i32 %chosen.1.us + %indvars.iv.next.1 = add nsw i64 %indvars.iv, 2 + %exitcond.1 = icmp eq i64 %indvars.iv.next.1, %3 + br i1 %exitcond.1, label %for.cond4.for.cond.cleanup7_crit_edge.us.unr-lcssa, label %for.body8.us + +for.cond4.for.cond.cleanup7_crit_edge.us.unr-lcssa: ; preds = %for.body8.us + br label %for.cond4.for.cond.cleanup7_crit_edge.us + +for.cond4.for.cond.cleanup7_crit_edge.us: ; preds = %for.body8.us.prol.loopexit, %for.cond4.for.cond.cleanup7_crit_edge.us.unr-lcssa + %chosen.1.us.lcssa = phi i32 [ %chosen.1.us.lcssa.unr.ph, %for.body8.us.prol.loopexit ], [ %chosen.1.us.1, %for.cond4.for.cond.cleanup7_crit_edge.us.unr-lcssa ] + %arrayidx18.us = getelementptr inbounds i32, i32* %labels, i64 %indvars.iv95 + %49 = load i32, i32* %arrayidx18.us, align 4, !tbaa !70 + %not.cmp19.us = icmp ne i32 %chosen.1.us.lcssa, %49 + %inc21.us = zext i1 %not.cmp19.us to i32 + %num_errors.0.inc21.us = add nsw i32 %inc21.us, %num_errors.091.us + %indvars.iv.next96 = add nuw nsw i64 %indvars.iv95, 1 + %exitcond97 = icmp eq i64 %indvars.iv.next96, %2 + br i1 %exitcond97, label %for.cond.cleanup.loopexit, label %for.cond4.preheader.us + +for.cond4.preheader: ; preds = %for.cond4.preheader.preheader109, %for.cond4.preheader + %indvars.iv98 = phi i64 [ %indvars.iv.next99, %for.cond4.preheader ], [ %indvars.iv98.ph, %for.cond4.preheader.preheader109 ] + %num_errors.091 = phi i32 [ %num_errors.0.inc21, %for.cond4.preheader ], [ %num_errors.091.ph, %for.cond4.preheader.preheader109 ] + %arrayidx18 = getelementptr inbounds i32, i32* %labels, i64 %indvars.iv98 + %50 = load i32, i32* %arrayidx18, align 4, !tbaa !70 + %not.cmp19 = icmp ne i32 %50, 0 + %inc21 = zext i1 %not.cmp19 to i32 + %num_errors.0.inc21 = add nsw i32 %inc21, %num_errors.091 + %indvars.iv.next99 = add nuw nsw i64 %indvars.iv98, 1 + %exitcond100 = icmp eq i64 %indvars.iv.next99, %2 + br i1 %exitcond100, label %for.cond.cleanup.loopexit110, label %for.cond4.preheader, !llvm.loop !72 + +for.cond.cleanup.loopexit: ; preds = %for.cond4.for.cond.cleanup7_crit_edge.us + br label %for.cond.cleanup + +for.cond.cleanup.loopexit110: ; preds = %for.cond4.preheader + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit110, %for.cond.cleanup.loopexit, %middle.block, %entry + %num_errors.0.lcssa = phi i32 [ 0, %entry ], [ %40, %middle.block ], [ %num_errors.0.inc21.us, %for.cond.cleanup.loopexit ], [ %num_errors.0.inc21, %for.cond.cleanup.loopexit110 ] + %conv26 = sext i32 %num_errors.0.lcssa to i64 + %sub = sub i64 %2, %conv26 %conv27 = uitofp i64 %sub to double %conv29 = uitofp i64 %2 to double %div = fdiv fast double %conv27, %conv29 %mul31 = fmul fast double %div, 1.000000e+02 %conv32 = fptrunc double %mul31 to float %conv33 = fpext float %conv32 to double - %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.20, i64 0, i64 0), double %conv33) - %call34 = tail call %struct._IO_FILE* @fopen(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.21, i64 0, i64 0), i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.22, i64 0, i64 0)) - %cmp35 = icmp eq %struct._IO_FILE* %call34, null - br i1 %cmp35, label %if.end43, label %if.then36 + %call34 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.21, i64 0, i64 0), double %conv33) + %call35 = tail call %struct._IO_FILE* @fopen(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.22, i64 0, i64 0), i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.23, i64 0, i64 0)) + %cmp36 = icmp eq %struct._IO_FILE* %call35, null + br i1 %cmp36, label %if.end43, label %if.then37 -if.then36: ; preds = %for.cond.cleanup - %26 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i8* - call void @llvm.lifetime.start(i64 376, i8* nonnull %26) #7 - %27 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2 - %28 = getelementptr inbounds %"class.std::basic_ios", %"class.std::basic_ios"* %27, i64 0, i32 0 - call void @_ZNSt8ios_baseC2Ev(%"class.std::ios_base"* %28) #7 - %29 = getelementptr inbounds %"class.std::basic_ios", %"class.std::basic_ios"* %27, i64 0, i32 0, i32 0 - store i32 (...)** bitcast (i8** getelementptr inbounds ({ [4 x i8*] }, { [4 x i8*] }* @_ZTVSt9basic_iosIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %29, align 16, !tbaa !40 +if.then37: ; preds = %for.cond.cleanup + %51 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i8* + call void @llvm.lifetime.start(i64 376, i8* nonnull %51) #2 + %52 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2 + %53 = getelementptr inbounds %"class.std::basic_ios", %"class.std::basic_ios"* %52, i64 0, i32 0 + call void @_ZNSt8ios_baseC2Ev(%"class.std::ios_base"* %53) #2 + %54 = getelementptr inbounds %"class.std::basic_ios", %"class.std::basic_ios"* %52, i64 0, i32 0, i32 0 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [4 x i8*] }, { [4 x i8*] }* @_ZTVSt9basic_iosIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %54, align 16, !tbaa !46 %_M_tie.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 1 - store %"class.std::basic_ostream"* null, %"class.std::basic_ostream"** %_M_tie.i.i, align 8, !tbaa !42 + store %"class.std::basic_ostream"* null, %"class.std::basic_ostream"** %_M_tie.i.i, align 8, !tbaa !48 %_M_fill.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 2 - store i8 0, i8* %_M_fill.i.i, align 16, !tbaa !45 + store i8 0, i8* %_M_fill.i.i, align 16, !tbaa !51 %_M_fill_init.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 3 - store i8 0, i8* %_M_fill_init.i.i, align 1, !tbaa !46 + store i8 0, i8* %_M_fill_init.i.i, align 1, !tbaa !52 %_M_streambuf.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 4 - %30 = bitcast %"class.std::basic_streambuf"** %_M_streambuf.i.i to i8* - call void @llvm.memset.p0i8.i64(i8* %30, i8 0, i64 32, i32 8, i1 false) #7 - %31 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 1) to i64*), align 8 - %32 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i64* - store i64 %31, i64* %32, align 16, !tbaa !40 - %33 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 2) to i64*), align 8 - %34 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i8** - %vtable.cast.i.i = inttoptr i64 %31 to i8* + %55 = bitcast %"class.std::basic_streambuf"** %_M_streambuf.i.i to i8* + call void @llvm.memset.p0i8.i64(i8* %55, i8 0, i64 32, i32 8, i1 false) #2 + %56 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 1) to i64*), align 8 + %57 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i64* + store i64 %56, i64* %57, align 16, !tbaa !46 + %58 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 2) to i64*), align 8 + %59 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i8** + %vtable.cast.i.i = inttoptr i64 %56 to i8* %vbase.offset.ptr.i.i = getelementptr i8, i8* %vtable.cast.i.i, i64 -24 - %35 = bitcast i8* %vbase.offset.ptr.i.i to i64* - %vbase.offset.i.i = load i64, i64* %35, align 8 - %add.ptr.i.i = getelementptr inbounds i8, i8* %26, i64 %vbase.offset.i.i - %36 = bitcast i8* %add.ptr.i.i to i64* - store i64 %33, i64* %36, align 8, !tbaa !40 - %vtable3.i.i = load i8*, i8** %34, align 16, !tbaa !40 + %60 = bitcast i8* %vbase.offset.ptr.i.i to i64* + %vbase.offset.i.i = load i64, i64* %60, align 8 + %add.ptr.i.i = getelementptr inbounds i8, i8* %51, i64 %vbase.offset.i.i + %61 = bitcast i8* %add.ptr.i.i to i64* + store i64 %58, i64* %61, align 8, !tbaa !46 + %vtable3.i.i = load i8*, i8** %59, align 16, !tbaa !46 %vbase.offset.ptr4.i.i = getelementptr i8, i8* %vtable3.i.i, i64 -24 - %37 = bitcast i8* %vbase.offset.ptr4.i.i to i64* - %vbase.offset5.i.i = load i64, i64* %37, align 8 - %add.ptr6.i.i = getelementptr inbounds i8, i8* %26, i64 %vbase.offset5.i.i - %38 = bitcast i8* %add.ptr6.i.i to %"class.std::basic_ios"* - call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %38, %"class.std::basic_streambuf"* null) #7 - store i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 1, i64 3) to i32 (...)**), i32 (...)*** %29, align 16, !tbaa !40 + %62 = bitcast i8* %vbase.offset.ptr4.i.i to i64* + %vbase.offset5.i.i = load i64, i64* %62, align 8 + %add.ptr6.i.i = getelementptr inbounds i8, i8* %51, i64 %vbase.offset5.i.i + %63 = bitcast i8* %add.ptr6.i.i to %"class.std::basic_ios"* + call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %63, %"class.std::basic_streambuf"* null) #2 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 1, i64 3) to i32 (...)**), i32 (...)*** %54, align 16, !tbaa !46 %_M_stringbuf.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1 - %39 = getelementptr inbounds %"class.std::__cxx11::basic_stringbuf", %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i, i64 0, i32 0, i32 0 - %40 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to <2 x i32 (...)**>* - store <2 x i32 (...)**> <i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 3) to i32 (...)**), i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**)>, <2 x i32 (...)**>* %40, align 16, !tbaa !40 + %64 = getelementptr inbounds %"class.std::__cxx11::basic_stringbuf", %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i, i64 0, i32 0, i32 0 + %65 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to <2 x i32 (...)**>* + store <2 x i32 (...)**> <i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 3) to i32 (...)**), i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**)>, <2 x i32 (...)**>* %65, align 16, !tbaa !46 %_M_in_beg.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 1 %_M_buf_locale.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 7 - %41 = bitcast i8** %_M_in_beg.i.i.i to i8* - call void @llvm.memset.p0i8.i64(i8* %41, i8 0, i64 48, i32 8, i1 false) #7 - call void @_ZNSt6localeC1Ev(%"class.std::locale"* %_M_buf_locale.i.i.i) #7 - store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %39, align 8, !tbaa !40 + %66 = bitcast i8** %_M_in_beg.i.i.i to i8* + call void @llvm.memset.p0i8.i64(i8* %66, i8 0, i64 48, i32 8, i1 false) #2 + call void @_ZNSt6localeC1Ev(%"class.std::locale"* %_M_buf_locale.i.i.i) #2 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %64, align 8, !tbaa !46 %_M_mode.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 1 - store i32 16, i32* %_M_mode.i.i, align 8, !tbaa !47 + store i32 16, i32* %_M_mode.i.i, align 8, !tbaa !53 %_M_string.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2 - %42 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 2 - %43 = bitcast %"class.std::__cxx11::basic_string"* %_M_string.i.i to %union.anon** - store %union.anon* %42, %union.anon** %43, align 8, !tbaa !52 + %67 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 2 + %68 = bitcast %"class.std::__cxx11::basic_string"* %_M_string.i.i to %union.anon** + store %union.anon* %67, %union.anon** %68, align 8, !tbaa !58 %_M_string_length.i.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 1 - store i64 0, i64* %_M_string_length.i.i.i.i.i, align 8, !tbaa !53 - %.cast.i.i.i = bitcast %union.anon* %42 to i8* - store i8 0, i8* %.cast.i.i.i, align 8, !tbaa !36 - %vtable.i = load i8*, i8** %34, align 16, !tbaa !40 + store i64 0, i64* %_M_string_length.i.i.i.i.i, align 8, !tbaa !59 + %.cast.i.i.i = bitcast %union.anon* %67 to i8* + store i8 0, i8* %.cast.i.i.i, align 8, !tbaa !42 + %vtable.i = load i8*, i8** %59, align 16, !tbaa !46 %vbase.offset.ptr.i = getelementptr i8, i8* %vtable.i, i64 -24 - %44 = bitcast i8* %vbase.offset.ptr.i to i64* - %vbase.offset.i = load i64, i64* %44, align 8 - %add.ptr2.i = getelementptr inbounds i8, i8* %26, i64 %vbase.offset.i - %45 = bitcast i8* %add.ptr2.i to %"class.std::basic_ios"* - %46 = getelementptr inbounds %"class.std::__cxx11::basic_stringbuf", %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i, i64 0, i32 0 - call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %45, %"class.std::basic_streambuf"* %46) #7 - %47 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to %"class.std::basic_ostream"* - %vtable.i72 = load i8*, i8** %34, align 16, !tbaa !40 - %vbase.offset.ptr.i73 = getelementptr i8, i8* %vtable.i72, i64 -24 - %48 = bitcast i8* %vbase.offset.ptr.i73 to i64* - %vbase.offset.i74 = load i64, i64* %48, align 8 - %add.ptr.i = getelementptr inbounds i8, i8* %26, i64 %vbase.offset.i74 + %69 = bitcast i8* %vbase.offset.ptr.i to i64* + %vbase.offset.i = load i64, i64* %69, align 8 + %add.ptr2.i = getelementptr inbounds i8, i8* %51, i64 %vbase.offset.i + %70 = bitcast i8* %add.ptr2.i to %"class.std::basic_ios"* + %71 = getelementptr inbounds %"class.std::__cxx11::basic_stringbuf", %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i, i64 0, i32 0 + call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %70, %"class.std::basic_streambuf"* %71) #2 + %72 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to %"class.std::basic_ostream"* + %vtable.i77 = load i8*, i8** %59, align 16, !tbaa !46 + %vbase.offset.ptr.i78 = getelementptr i8, i8* %vtable.i77, i64 -24 + %73 = bitcast i8* %vbase.offset.ptr.i78 to i64* + %vbase.offset.i79 = load i64, i64* %73, align 8 + %add.ptr.i = getelementptr inbounds i8, i8* %51, i64 %vbase.offset.i79 %_M_flags.i.i = getelementptr inbounds i8, i8* %add.ptr.i, i64 24 - %49 = bitcast i8* %_M_flags.i.i to i32* - %50 = load i32, i32* %49, align 8, !tbaa !57 - %and.i.i.i.i = and i32 %50, -261 + %74 = bitcast i8* %_M_flags.i.i to i32* + %75 = load i32, i32* %74, align 8, !tbaa !65 + %and.i.i.i.i = and i32 %75, -261 %or.i.i.i.i = or i32 %and.i.i.i.i, 4 - store i32 %or.i.i.i.i, i32* %49, align 4, !tbaa !54 - %call.i = call dereferenceable(272) %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"* nonnull %47, double %conv33) #7 - %51 = bitcast %"class.std::__cxx11::basic_string"* %print_str to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %51) #7 - call void @_ZNKSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEE3strEv(%"class.std::__cxx11::basic_string"* nonnull sret %print_str, %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i) #7 + store i32 %or.i.i.i.i, i32* %74, align 4, !tbaa !60 + %call.i = call dereferenceable(272) %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"* nonnull %72, double %conv33) #2 + %76 = bitcast %"class.std::__cxx11::basic_string"* %print_str to i8* + call void @llvm.lifetime.start(i64 32, i8* nonnull %76) #2 + call void @_ZNKSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEE3strEv(%"class.std::__cxx11::basic_string"* nonnull sret %print_str, %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i) #2 %_M_p.i.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 0, i32 0 - %52 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !56 + %77 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !62 %_M_string_length.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 1 - %53 = load i64, i64* %_M_string_length.i, align 8, !tbaa !53 - %call41 = call i64 @fwrite(i8* %52, i64 1, i64 %53, %struct._IO_FILE* nonnull %call34) - %call42 = call i32 @fclose(%struct._IO_FILE* nonnull %call34) - %54 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !56 - %55 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 2 - %arraydecay.i.i.i.i = bitcast %union.anon* %55 to i8* - %cmp.i.i.i = icmp eq i8* %54, %arraydecay.i.i.i.i + %78 = load i64, i64* %_M_string_length.i, align 8, !tbaa !59 + %call42 = call i64 @fwrite(i8* %77, i64 1, i64 %78, %struct._IO_FILE* nonnull %call35) + %79 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !62 + %80 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 2 + %arraydecay.i.i.i.i = bitcast %union.anon* %80 to i8* + %cmp.i.i.i = icmp eq i8* %79, %arraydecay.i.i.i.i br i1 %cmp.i.i.i, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit, label %if.then.i.i -if.then.i.i: ; preds = %if.then36 - call void @_ZdlPv(i8* %54) #7 +if.then.i.i: ; preds = %if.then37 + call void @_ZdlPv(i8* %79) #2 br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit: ; preds = %if.then36, %if.then.i.i - call void @llvm.lifetime.end(i64 32, i8* nonnull %51) #7 - %56 = load i64, i64* bitcast ([4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE to i64*), align 8 - store i64 %56, i64* %32, align 16, !tbaa !40 - %57 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 3) to i64*), align 8 - %vtable.cast.i.i76 = inttoptr i64 %56 to i8* - %vbase.offset.ptr.i.i77 = getelementptr i8, i8* %vtable.cast.i.i76, i64 -24 - %58 = bitcast i8* %vbase.offset.ptr.i.i77 to i64* - %vbase.offset.i.i78 = load i64, i64* %58, align 8 - %add.ptr.i.i79 = getelementptr inbounds i8, i8* %26, i64 %vbase.offset.i.i78 - %59 = bitcast i8* %add.ptr.i.i79 to i64* - store i64 %57, i64* %59, align 8, !tbaa !40 - %60 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 0 - store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %60, align 8, !tbaa !40 +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit: ; preds = %if.then37, %if.then.i.i + call void @llvm.lifetime.end(i64 32, i8* nonnull %76) #2 + %81 = load i64, i64* bitcast ([4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE to i64*), align 8 + store i64 %81, i64* %57, align 16, !tbaa !46 + %82 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 3) to i64*), align 8 + %vtable.cast.i.i81 = inttoptr i64 %81 to i8* + %vbase.offset.ptr.i.i82 = getelementptr i8, i8* %vtable.cast.i.i81, i64 -24 + %83 = bitcast i8* %vbase.offset.ptr.i.i82 to i64* + %vbase.offset.i.i83 = load i64, i64* %83, align 8 + %add.ptr.i.i84 = getelementptr inbounds i8, i8* %51, i64 %vbase.offset.i.i83 + %84 = bitcast i8* %add.ptr.i.i84 to i64* + store i64 %82, i64* %84, align 8, !tbaa !46 + %85 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 0 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %85, align 8, !tbaa !46 %_M_p.i.i.i.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 0, i32 0 - %61 = load i8*, i8** %_M_p.i.i.i.i.i.i.i, align 8, !tbaa !56 - %cmp.i.i.i.i.i.i = icmp eq i8* %61, %.cast.i.i.i + %86 = load i8*, i8** %_M_p.i.i.i.i.i.i.i, align 8, !tbaa !62 + %cmp.i.i.i.i.i.i = icmp eq i8* %86, %.cast.i.i.i br i1 %cmp.i.i.i.i.i.i, label %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit, label %if.then.i.i.i.i.i if.then.i.i.i.i.i: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit - call void @_ZdlPv(i8* %61) #7 + call void @_ZdlPv(i8* %86) #2 br label %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit _ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit, %if.then.i.i.i.i.i - store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %60, align 8, !tbaa !40 - call void @_ZNSt6localeD1Ev(%"class.std::locale"* nonnull %_M_buf_locale.i.i.i) #7 - %62 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 0 - call void @_ZNSt8ios_baseD2Ev(%"class.std::ios_base"* %62) #7 - call void @llvm.lifetime.end(i64 376, i8* nonnull %26) #7 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %85, align 8, !tbaa !46 + call void @_ZNSt6localeD1Ev(%"class.std::locale"* nonnull %_M_buf_locale.i.i.i) #2 + %87 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 0 + call void @_ZNSt8ios_baseD2Ev(%"class.std::ios_base"* %87) #2 + call void @llvm.lifetime.end(i64 376, i8* nonnull %51) #2 br label %if.end43 if.end43: ; preds = %for.cond.cleanup, %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit + %call44 = call i32 @fclose(%struct._IO_FILE* %call35) + ret float %conv32 +} + +; Function Attrs: norecurse nounwind readnone uwtable +define zeroext i1 @_Z16descendFloatComp9ClassProbS_(i64 %obj1.coerce, i64 %obj2.coerce) #7 { +entry: + %obj1.sroa.0.0.extract.trunc = trunc i64 %obj1.coerce to i32 + %0 = bitcast i32 %obj1.sroa.0.0.extract.trunc to float + %obj2.sroa.0.0.extract.trunc = trunc i64 %obj2.coerce to i32 + %1 = bitcast i32 %obj2.sroa.0.0.extract.trunc to float + %cmp = fcmp fast ogt float %0, %1 + ret i1 %cmp +} + +; Function Attrs: nounwind uwtable +define float @_Z19computeTop5AccuracyPhiPvj(i8* nocapture readonly %labels, i32 %num_labels, i8* nocapture readonly %result_ptr, i32 %num_classes) local_unnamed_addr #3 { +entry: + %elem_probs.sroa.9 = alloca i64, align 8 + %elem_probs.sroa.15 = alloca %struct.ClassProb*, align 8 + %ss = alloca %"class.std::__cxx11::basic_ostringstream", align 16 + %print_str = alloca %"class.std::__cxx11::basic_string", align 8 + %dim_sizes = getelementptr inbounds i8, i8* %result_ptr, i64 96 + %0 = bitcast i8* %dim_sizes to i64** + %1 = load i64*, i64** %0, align 8, !tbaa !14 + %2 = load i64, i64* %1, align 8, !tbaa !15 + %arrayidx3 = getelementptr inbounds i64, i64* %1, i64 1 + %3 = load i64, i64* %arrayidx3, align 8, !tbaa !15 + %host_data = getelementptr inbounds i8, i8* %result_ptr, i64 48 + %4 = bitcast i8* %host_data to float** + %5 = load float*, float** %4, align 8, !tbaa !17 + %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([34 x i8], [34 x i8]* @.str.24, i64 0, i64 0), i64 %2, i64 %3) + %cmp162 = icmp sgt i32 %num_labels, 0 + br i1 %cmp162, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %elem_probs.sroa.9.0..sroa_cast151 = bitcast i64* %elem_probs.sroa.9 to i8* + %elem_probs.sroa.15.0..sroa_cast149 = bitcast %struct.ClassProb** %elem_probs.sroa.15 to i8* + %cmp5156 = icmp eq i32 %num_classes, 0 + %elem_probs.sroa.9.0._M_finish.i110.sroa_cast = bitcast i64* %elem_probs.sroa.9 to %struct.ClassProb** + %6 = zext i32 %num_classes to i64 + %7 = sext i32 %num_labels to i64 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %_ZNSt6vectorI9ClassProbSaIS0_EED2Ev.exit + %phitmp = sext i32 %add31.num_errors.0 to i64 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %num_errors.0.lcssa = phi i64 [ 0, %entry ], [ %phitmp, %for.cond.cleanup.loopexit ] + %sub = sub i64 %2, %num_errors.0.lcssa + %conv37 = uitofp i64 %sub to double + %conv39 = uitofp i64 %2 to double + %div = fdiv fast double %conv37, %conv39 + %mul41 = fmul fast double %div, 1.000000e+02 + %conv42 = fptrunc double %mul41 to float + %conv43 = fpext float %conv42 to double + %call44 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.21, i64 0, i64 0), double %conv43) + %call45 = tail call %struct._IO_FILE* @fopen(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.22, i64 0, i64 0), i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.23, i64 0, i64 0)) + %cmp46 = icmp eq %struct._IO_FILE* %call45, null + br i1 %cmp46, label %if.end53, label %if.then47 + +for.body: ; preds = %for.body.lr.ph, %_ZNSt6vectorI9ClassProbSaIS0_EED2Ev.exit + %indvars.iv169 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next170, %_ZNSt6vectorI9ClassProbSaIS0_EED2Ev.exit ] + %num_errors.0164 = phi i32 [ 0, %for.body.lr.ph ], [ %add31.num_errors.0, %_ZNSt6vectorI9ClassProbSaIS0_EED2Ev.exit ] + call void @llvm.lifetime.start(i64 8, i8* nonnull %elem_probs.sroa.9.0..sroa_cast151) + call void @llvm.lifetime.start(i64 8, i8* nonnull %elem_probs.sroa.15.0..sroa_cast149) + store i64 0, i64* %elem_probs.sroa.9, align 8 + store %struct.ClassProb* null, %struct.ClassProb** %elem_probs.sroa.15, align 8 + br i1 %cmp5156, label %for.cond.cleanup6, label %for.body7.lr.ph + +for.body7.lr.ph: ; preds = %for.body + %mul = mul i64 %indvars.iv169, %3 + br label %for.body7 + +for.cond.cleanup6.loopexit: ; preds = %_ZNSt6vectorI9ClassProbSaIS0_EE9push_backERKS0_.exit + br label %for.cond.cleanup6 + +for.cond.cleanup6: ; preds = %for.cond.cleanup6.loopexit, %for.body + %elem_probs.sroa.9.0.elem_probs.sroa.9.8.146 = phi i64 [ 0, %for.body ], [ %elem_probs.sroa.9.0.elem_probs.sroa.9.8..pre, %for.cond.cleanup6.loopexit ] + %elem_probs.sroa.0.0.lcssa = phi i64 [ 0, %for.body ], [ %elem_probs.sroa.0.1, %for.cond.cleanup6.loopexit ] + %8 = inttoptr i64 %elem_probs.sroa.0.0.lcssa to %struct.ClassProb* + %9 = inttoptr i64 %elem_probs.sroa.9.0.elem_probs.sroa.9.8.146 to %struct.ClassProb* + %cmp.i.i.i = icmp eq %struct.ClassProb* %8, %9 + br i1 %cmp.i.i.i, label %for.cond16.preheader, label %if.then.i.i + +for.cond16.preheader.loopexit: ; preds = %_ZSt25__unguarded_linear_insertIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops14_Val_comp_iterIPFbS2_S2_EEEEvT_T0_.exit.i.i + br label %for.cond16.preheader + +for.cond16.preheader.loopexit179: ; preds = %for.inc.i.i + br label %for.cond16.preheader + +for.cond16.preheader: ; preds = %for.cond16.preheader.loopexit179, %for.cond16.preheader.loopexit, %for.cond.preheader.i.i, %_ZSt16__insertion_sortIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_T0_.exit61.i, %for.cond.cleanup6 + %arrayidx24 = getelementptr inbounds i8, i8* %labels, i64 %indvars.iv169 + %10 = load i8, i8* %arrayidx24, align 1, !tbaa !42 + %conv25 = zext i8 %10 to i32 + %cProb20.sroa.3.0..sroa_idx62 = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %8, i64 0, i32 1 + %cProb20.sroa.3.0.copyload = load i32, i32* %cProb20.sroa.3.0..sroa_idx62, align 4 + %cmp26 = icmp eq i32 %cProb20.sroa.3.0.copyload, %conv25 + %cProb20.sroa.3.0..sroa_idx62.1 = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %8, i64 1, i32 1 + %cProb20.sroa.3.0.copyload.1 = load i32, i32* %cProb20.sroa.3.0..sroa_idx62.1, align 4 + %cmp26.1 = icmp eq i32 %cProb20.sroa.3.0.copyload.1, %conv25 + %narrow = or i1 %cmp26.1, %cmp26 + %cProb20.sroa.3.0..sroa_idx62.2 = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %8, i64 2, i32 1 + %cProb20.sroa.3.0.copyload.2 = load i32, i32* %cProb20.sroa.3.0..sroa_idx62.2, align 4 + %cmp26.2 = icmp eq i32 %cProb20.sroa.3.0.copyload.2, %conv25 + %narrow174 = or i1 %cmp26.2, %narrow + %cProb20.sroa.3.0..sroa_idx62.3 = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %8, i64 3, i32 1 + %cProb20.sroa.3.0.copyload.3 = load i32, i32* %cProb20.sroa.3.0..sroa_idx62.3, align 4 + %cmp26.3 = icmp eq i32 %cProb20.sroa.3.0.copyload.3, %conv25 + %narrow175 = or i1 %cmp26.3, %narrow174 + %cProb20.sroa.3.0..sroa_idx62.4 = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %8, i64 4, i32 1 + %cProb20.sroa.3.0.copyload.4 = load i32, i32* %cProb20.sroa.3.0..sroa_idx62.4, align 4 + %cmp26.4 = icmp eq i32 %cProb20.sroa.3.0.copyload.4, %conv25 + %narrow176 = or i1 %cmp26.4, %narrow175 + %11 = xor i1 %narrow176, true + %12 = zext i1 %11 to i32 + %add31.num_errors.0 = add nsw i32 %12, %num_errors.0164 + %tobool.i.i.i117 = icmp eq i64 %elem_probs.sroa.0.0.lcssa, 0 + br i1 %tobool.i.i.i117, label %_ZNSt6vectorI9ClassProbSaIS0_EED2Ev.exit, label %if.then.i.i.i + +if.then.i.i: ; preds = %for.cond.cleanup6 + %sub.ptr.sub.i.i.i = sub i64 %elem_probs.sroa.9.0.elem_probs.sroa.9.8.146, %elem_probs.sroa.0.0.lcssa + %sub.ptr.div.i.i.i = ashr exact i64 %sub.ptr.sub.i.i.i, 3 + %13 = tail call i64 @llvm.ctlz.i64(i64 %sub.ptr.div.i.i.i, i1 true) #2 + %sub.i.i.i = shl nuw nsw i64 %13, 1 + %mul.i.i = xor i64 %sub.i.i.i, 126 + tail call void @_ZSt16__introsort_loopIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElNS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_T0_T1_(%struct.ClassProb* %8, %struct.ClassProb* %9, i64 %mul.i.i, i1 (i64, i64)* nonnull @_Z16descendFloatComp9ClassProbS_) #2 + %cmp.i = icmp sgt i64 %sub.ptr.sub.i.i.i, 128 + br i1 %cmp.i, label %for.body.lr.ph.i30.i, label %for.cond.preheader.i.i + +for.body.lr.ph.i30.i: ; preds = %if.then.i.i + %agg.tmp3.sroa.0.0..sroa_cast.i.i28.i = inttoptr i64 %elem_probs.sroa.0.0.lcssa to i64* + %14 = inttoptr i64 %elem_probs.sroa.0.0.lcssa to i8* + br label %for.body.i37.i + +for.body.i37.i: ; preds = %for.inc.i60.i, %for.body.lr.ph.i30.i + %incdec.ptr.i54.i31.idx.i = phi i64 [ 1, %for.body.lr.ph.i30.i ], [ %incdec.ptr.i54.i31.add.i, %for.inc.i60.i ] + %__i.sroa.0.0.sink53.i32.i = phi %struct.ClassProb* [ %8, %for.body.lr.ph.i30.i ], [ %incdec.ptr.i54.i31.ptr.i, %for.inc.i60.i ] + %incdec.ptr.i54.i31.ptr.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %8, i64 %incdec.ptr.i54.i31.idx.i + %agg.tmp.sroa.0.0..sroa_cast.i.i33.i = bitcast %struct.ClassProb* %incdec.ptr.i54.i31.ptr.i to i64* + %agg.tmp.sroa.0.0.copyload.i.i34.i = load i64, i64* %agg.tmp.sroa.0.0..sroa_cast.i.i33.i, align 4 + %agg.tmp3.sroa.0.0.copyload.i.i35.i = load i64, i64* %agg.tmp3.sroa.0.0..sroa_cast.i.i28.i, align 4 + %obj1.sroa.0.0.extract.trunc.i101 = trunc i64 %agg.tmp.sroa.0.0.copyload.i.i34.i to i32 + %15 = bitcast i32 %obj1.sroa.0.0.extract.trunc.i101 to float + %obj2.sroa.0.0.extract.trunc.i102 = trunc i64 %agg.tmp3.sroa.0.0.copyload.i.i35.i to i32 + %16 = bitcast i32 %obj2.sroa.0.0.extract.trunc.i102 to float + %cmp.i103 = fcmp fast ogt float %15, %16 + br i1 %cmp.i103, label %if.then10.i42.i, label %if.else.i50.i + +if.then10.i42.i: ; preds = %for.body.i37.i + %incdec.ptr.i54.i31.ptr.idx.i = shl nuw i64 %incdec.ptr.i54.i31.idx.i, 3 + %sub.ptr.div.i.i.i.i.i40.i = ashr exact i64 %incdec.ptr.i54.i31.ptr.idx.i, 3 + %add.ptr.i41.i43.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__i.sroa.0.0.sink53.i32.i, i64 2 + %.pre.i.i.i.i.i44.i = sub nsw i64 0, %sub.ptr.div.i.i.i.i.i40.i + %.pre9.i.i.i.i.i45.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %add.ptr.i41.i43.i, i64 %.pre.i.i.i.i.i44.i + %17 = bitcast %struct.ClassProb* %.pre9.i.i.i.i.i45.i to i8* + tail call void @llvm.memmove.p0i8.p0i8.i64(i8* %17, i8* nonnull %14, i64 %incdec.ptr.i54.i31.ptr.idx.i, i32 4, i1 false) #2 + br label %for.inc.i60.i + +if.else.i50.i: ; preds = %for.body.i37.i + %agg.tmp2.sroa.0.0..sroa_cast.i26.i.i47.i = bitcast %struct.ClassProb* %__i.sroa.0.0.sink53.i32.i to i64* + %agg.tmp2.sroa.0.0.copyload.i27.i.i48.i = load i64, i64* %agg.tmp2.sroa.0.0..sroa_cast.i26.i.i47.i, align 4 + %obj2.sroa.0.0.extract.trunc.i90 = trunc i64 %agg.tmp2.sroa.0.0.copyload.i27.i.i48.i to i32 + %18 = bitcast i32 %obj2.sroa.0.0.extract.trunc.i90 to float + %cmp.i91 = fcmp fast ogt float %15, %18 + br i1 %cmp.i91, label %while.body.i.i56.i.preheader, label %for.inc.i60.i + +while.body.i.i56.i.preheader: ; preds = %if.else.i50.i + br label %while.body.i.i56.i + +while.body.i.i56.i: ; preds = %while.body.i.i56.i.preheader, %while.body.i.i56.i + %19 = phi i64 [ %agg.tmp2.sroa.0.0.copyload.i.i.i54.i, %while.body.i.i56.i ], [ %agg.tmp2.sroa.0.0.copyload.i27.i.i48.i, %while.body.i.i56.i.preheader ] + %20 = phi i64* [ %indvars55.i51.i, %while.body.i.i56.i ], [ %agg.tmp.sroa.0.0..sroa_cast.i.i33.i, %while.body.i.i56.i.preheader ] + %21 = phi %struct.ClassProb* [ %incdec.ptr.i.i.i52.i, %while.body.i.i56.i ], [ %__i.sroa.0.0.sink53.i32.i, %while.body.i.i56.i.preheader ] + %indvars55.i51.i = bitcast %struct.ClassProb* %21 to i64* + store i64 %19, i64* %20, align 4 + %incdec.ptr.i.i.i52.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %21, i64 -1 + %indvars.i53.i = bitcast %struct.ClassProb* %incdec.ptr.i.i.i52.i to i64* + %agg.tmp2.sroa.0.0.copyload.i.i.i54.i = load i64, i64* %indvars.i53.i, align 4 + %obj2.sroa.0.0.extract.trunc.i99 = trunc i64 %agg.tmp2.sroa.0.0.copyload.i.i.i54.i to i32 + %22 = bitcast i32 %obj2.sroa.0.0.extract.trunc.i99 to float + %cmp.i100 = fcmp fast ogt float %15, %22 + br i1 %cmp.i100, label %while.body.i.i56.i, label %for.inc.i60.i.loopexit + +for.inc.i60.i.loopexit: ; preds = %while.body.i.i56.i + br label %for.inc.i60.i + +for.inc.i60.i: ; preds = %for.inc.i60.i.loopexit, %if.else.i50.i, %if.then10.i42.i + %agg.tmp3.sroa.0.0..sroa_cast.i.sink.i57.i = phi i64* [ %agg.tmp.sroa.0.0..sroa_cast.i.i33.i, %if.else.i50.i ], [ %agg.tmp3.sroa.0.0..sroa_cast.i.i28.i, %if.then10.i42.i ], [ %indvars55.i51.i, %for.inc.i60.i.loopexit ] + store i64 %agg.tmp.sroa.0.0.copyload.i.i34.i, i64* %agg.tmp3.sroa.0.0..sroa_cast.i.sink.i57.i, align 4 + %incdec.ptr.i54.i31.add.i = add nuw nsw i64 %incdec.ptr.i54.i31.idx.i, 1 + %cmp.i38.i59.i = icmp eq i64 %incdec.ptr.i54.i31.add.i, 16 + br i1 %cmp.i38.i59.i, label %_ZSt16__insertion_sortIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_T0_.exit61.i, label %for.body.i37.i + +_ZSt16__insertion_sortIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_T0_.exit61.i: ; preds = %for.inc.i60.i + %add.ptr.i.i86 = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %8, i64 16 + %cmp.i13.i.i = icmp eq %struct.ClassProb* %add.ptr.i.i86, %9 + br i1 %cmp.i13.i.i, label %for.cond16.preheader, label %for.body.i67.i.preheader + +for.body.i67.i.preheader: ; preds = %_ZSt16__insertion_sortIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_T0_.exit61.i + br label %for.body.i67.i + +for.body.i67.i: ; preds = %for.body.i67.i.preheader, %_ZSt25__unguarded_linear_insertIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops14_Val_comp_iterIPFbS2_S2_EEEEvT_T0_.exit.i.i + %__i.sroa.0.014.i.i = phi %struct.ClassProb* [ %incdec.ptr.i.i73.i, %_ZSt25__unguarded_linear_insertIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops14_Val_comp_iterIPFbS2_S2_EEEEvT_T0_.exit.i.i ], [ %add.ptr.i.i86, %for.body.i67.i.preheader ] + %23 = bitcast %struct.ClassProb* %__i.sroa.0.014.i.i to i64* + %24 = load i64, i64* %23, align 4 + %incdec.ptr.i25.i.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__i.sroa.0.014.i.i, i64 -1 + %agg.tmp2.sroa.0.0..sroa_cast.i26.i.i64.i = bitcast %struct.ClassProb* %incdec.ptr.i25.i.i.i to i64* + %agg.tmp2.sroa.0.0.copyload.i27.i.i65.i = load i64, i64* %agg.tmp2.sroa.0.0..sroa_cast.i26.i.i64.i, align 4 + %obj1.sroa.0.0.extract.trunc.i = trunc i64 %24 to i32 + %25 = bitcast i32 %obj1.sroa.0.0.extract.trunc.i to float + %obj2.sroa.0.0.extract.trunc.i = trunc i64 %agg.tmp2.sroa.0.0.copyload.i27.i.i65.i to i32 + %26 = bitcast i32 %obj2.sroa.0.0.extract.trunc.i to float + %cmp.i88 = fcmp fast ogt float %25, %26 + br i1 %cmp.i88, label %while.body.i.i72.i.preheader, label %_ZSt25__unguarded_linear_insertIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops14_Val_comp_iterIPFbS2_S2_EEEEvT_T0_.exit.i.i + +while.body.i.i72.i.preheader: ; preds = %for.body.i67.i + br label %while.body.i.i72.i + +while.body.i.i72.i: ; preds = %while.body.i.i72.i.preheader, %while.body.i.i72.i + %27 = phi i64 [ %agg.tmp2.sroa.0.0.copyload.i.i.i70.i, %while.body.i.i72.i ], [ %agg.tmp2.sroa.0.0.copyload.i27.i.i65.i, %while.body.i.i72.i.preheader ] + %28 = phi i64* [ %indvars15.i.i, %while.body.i.i72.i ], [ %23, %while.body.i.i72.i.preheader ] + %29 = phi %struct.ClassProb* [ %incdec.ptr.i.i.i68.i, %while.body.i.i72.i ], [ %incdec.ptr.i25.i.i.i, %while.body.i.i72.i.preheader ] + %indvars15.i.i = bitcast %struct.ClassProb* %29 to i64* + store i64 %27, i64* %28, align 4 + %incdec.ptr.i.i.i68.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %29, i64 -1 + %indvars.i69.i = bitcast %struct.ClassProb* %incdec.ptr.i.i.i68.i to i64* + %agg.tmp2.sroa.0.0.copyload.i.i.i70.i = load i64, i64* %indvars.i69.i, align 4 + %obj2.sroa.0.0.extract.trunc.i96 = trunc i64 %agg.tmp2.sroa.0.0.copyload.i.i.i70.i to i32 + %30 = bitcast i32 %obj2.sroa.0.0.extract.trunc.i96 to float + %cmp.i97 = fcmp fast ogt float %25, %30 + br i1 %cmp.i97, label %while.body.i.i72.i, label %_ZSt25__unguarded_linear_insertIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops14_Val_comp_iterIPFbS2_S2_EEEEvT_T0_.exit.i.i.loopexit + +_ZSt25__unguarded_linear_insertIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops14_Val_comp_iterIPFbS2_S2_EEEEvT_T0_.exit.i.i.loopexit: ; preds = %while.body.i.i72.i + br label %_ZSt25__unguarded_linear_insertIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops14_Val_comp_iterIPFbS2_S2_EEEEvT_T0_.exit.i.i + +_ZSt25__unguarded_linear_insertIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops14_Val_comp_iterIPFbS2_S2_EEEEvT_T0_.exit.i.i: ; preds = %_ZSt25__unguarded_linear_insertIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops14_Val_comp_iterIPFbS2_S2_EEEEvT_T0_.exit.i.i.loopexit, %for.body.i67.i + %.lcssa.i.i.i = phi i64* [ %23, %for.body.i67.i ], [ %indvars15.i.i, %_ZSt25__unguarded_linear_insertIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops14_Val_comp_iterIPFbS2_S2_EEEEvT_T0_.exit.i.i.loopexit ] + store i64 %24, i64* %.lcssa.i.i.i, align 4 + %incdec.ptr.i.i73.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__i.sroa.0.014.i.i, i64 1 + %cmp.i.i74.i = icmp eq %struct.ClassProb* %incdec.ptr.i.i73.i, %9 + br i1 %cmp.i.i74.i, label %for.cond16.preheader.loopexit, label %for.body.i67.i + +for.cond.preheader.i.i: ; preds = %if.then.i.i + %incdec.ptr.i51.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %8, i64 1 + %cmp.i3852.i.i = icmp eq %struct.ClassProb* %incdec.ptr.i51.i.i, %9 + br i1 %cmp.i3852.i.i, label %for.cond16.preheader, label %for.body.lr.ph.i.i + +for.body.lr.ph.i.i: ; preds = %for.cond.preheader.i.i + %agg.tmp3.sroa.0.0..sroa_cast.i.i.i = inttoptr i64 %elem_probs.sroa.0.0.lcssa to i64* + %31 = inttoptr i64 %elem_probs.sroa.0.0.lcssa to i8* + br label %for.body.i.i + +for.body.i.i: ; preds = %for.inc.i.i, %for.body.lr.ph.i.i + %incdec.ptr.i54.i.i = phi %struct.ClassProb* [ %incdec.ptr.i51.i.i, %for.body.lr.ph.i.i ], [ %incdec.ptr.i.i.i, %for.inc.i.i ] + %__i.sroa.0.0.sink53.i.i = phi %struct.ClassProb* [ %8, %for.body.lr.ph.i.i ], [ %incdec.ptr.i54.i.i, %for.inc.i.i ] + %agg.tmp.sroa.0.0..sroa_cast.i.i.i = bitcast %struct.ClassProb* %incdec.ptr.i54.i.i to i64* + %agg.tmp.sroa.0.0.copyload.i.i.i = load i64, i64* %agg.tmp.sroa.0.0..sroa_cast.i.i.i, align 4 + %agg.tmp3.sroa.0.0.copyload.i.i.i = load i64, i64* %agg.tmp3.sroa.0.0..sroa_cast.i.i.i, align 4 + %obj1.sroa.0.0.extract.trunc.i107 = trunc i64 %agg.tmp.sroa.0.0.copyload.i.i.i to i32 + %32 = bitcast i32 %obj1.sroa.0.0.extract.trunc.i107 to float + %obj2.sroa.0.0.extract.trunc.i108 = trunc i64 %agg.tmp3.sroa.0.0.copyload.i.i.i to i32 + %33 = bitcast i32 %obj2.sroa.0.0.extract.trunc.i108 to float + %cmp.i109 = fcmp fast ogt float %32, %33 + br i1 %cmp.i109, label %if.then10.i.i, label %if.else.i.i + +if.then10.i.i: ; preds = %for.body.i.i + %sub.ptr.lhs.cast.i.i.i.i.i.i = ptrtoint %struct.ClassProb* %incdec.ptr.i54.i.i to i64 + %sub.ptr.sub.i.i.i.i.i.i = sub i64 %sub.ptr.lhs.cast.i.i.i.i.i.i, %elem_probs.sroa.0.0.lcssa + %sub.ptr.div.i.i.i.i.i.i = ashr exact i64 %sub.ptr.sub.i.i.i.i.i.i, 3 + %tobool.i.i.i.i.i.i = icmp eq i64 %sub.ptr.div.i.i.i.i.i.i, 0 + br i1 %tobool.i.i.i.i.i.i, label %for.inc.i.i, label %if.then.i.i.i.i.i.i + +if.then.i.i.i.i.i.i: ; preds = %if.then10.i.i + %add.ptr.i41.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__i.sroa.0.0.sink53.i.i, i64 2 + %.pre.i.i.i.i.i.i = sub nsw i64 0, %sub.ptr.div.i.i.i.i.i.i + %.pre9.i.i.i.i.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %add.ptr.i41.i.i, i64 %.pre.i.i.i.i.i.i + %34 = bitcast %struct.ClassProb* %.pre9.i.i.i.i.i.i to i8* + tail call void @llvm.memmove.p0i8.p0i8.i64(i8* %34, i8* nonnull %31, i64 %sub.ptr.sub.i.i.i.i.i.i, i32 4, i1 false) #2 + br label %for.inc.i.i + +if.else.i.i: ; preds = %for.body.i.i + %agg.tmp2.sroa.0.0..sroa_cast.i26.i.i.i = bitcast %struct.ClassProb* %__i.sroa.0.0.sink53.i.i to i64* + %agg.tmp2.sroa.0.0.copyload.i27.i.i.i = load i64, i64* %agg.tmp2.sroa.0.0..sroa_cast.i26.i.i.i, align 4 + %obj2.sroa.0.0.extract.trunc.i93 = trunc i64 %agg.tmp2.sroa.0.0.copyload.i27.i.i.i to i32 + %35 = bitcast i32 %obj2.sroa.0.0.extract.trunc.i93 to float + %cmp.i94 = fcmp fast ogt float %32, %35 + br i1 %cmp.i94, label %while.body.i.i.i.preheader, label %for.inc.i.i + +while.body.i.i.i.preheader: ; preds = %if.else.i.i + br label %while.body.i.i.i + +while.body.i.i.i: ; preds = %while.body.i.i.i.preheader, %while.body.i.i.i + %36 = phi i64 [ %agg.tmp2.sroa.0.0.copyload.i.i.i.i, %while.body.i.i.i ], [ %agg.tmp2.sroa.0.0.copyload.i27.i.i.i, %while.body.i.i.i.preheader ] + %37 = phi i64* [ %indvars55.i.i, %while.body.i.i.i ], [ %agg.tmp.sroa.0.0..sroa_cast.i.i.i, %while.body.i.i.i.preheader ] + %38 = phi %struct.ClassProb* [ %incdec.ptr.i.i.i.i, %while.body.i.i.i ], [ %__i.sroa.0.0.sink53.i.i, %while.body.i.i.i.preheader ] + %indvars55.i.i = bitcast %struct.ClassProb* %38 to i64* + store i64 %36, i64* %37, align 4 + %incdec.ptr.i.i.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %38, i64 -1 + %indvars.i.i = bitcast %struct.ClassProb* %incdec.ptr.i.i.i.i to i64* + %agg.tmp2.sroa.0.0.copyload.i.i.i.i = load i64, i64* %indvars.i.i, align 4 + %obj2.sroa.0.0.extract.trunc.i105 = trunc i64 %agg.tmp2.sroa.0.0.copyload.i.i.i.i to i32 + %39 = bitcast i32 %obj2.sroa.0.0.extract.trunc.i105 to float + %cmp.i106 = fcmp fast ogt float %32, %39 + br i1 %cmp.i106, label %while.body.i.i.i, label %for.inc.i.i.loopexit + +for.inc.i.i.loopexit: ; preds = %while.body.i.i.i + br label %for.inc.i.i + +for.inc.i.i: ; preds = %for.inc.i.i.loopexit, %if.else.i.i, %if.then.i.i.i.i.i.i, %if.then10.i.i + %agg.tmp3.sroa.0.0..sroa_cast.i.sink.i.i = phi i64* [ %agg.tmp.sroa.0.0..sroa_cast.i.i.i, %if.else.i.i ], [ %agg.tmp3.sroa.0.0..sroa_cast.i.i.i, %if.then10.i.i ], [ %agg.tmp3.sroa.0.0..sroa_cast.i.i.i, %if.then.i.i.i.i.i.i ], [ %indvars55.i.i, %for.inc.i.i.loopexit ] + store i64 %agg.tmp.sroa.0.0.copyload.i.i.i, i64* %agg.tmp3.sroa.0.0..sroa_cast.i.sink.i.i, align 4 + %incdec.ptr.i.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %incdec.ptr.i54.i.i, i64 1 + %cmp.i38.i.i = icmp eq %struct.ClassProb* %incdec.ptr.i.i.i, %9 + br i1 %cmp.i38.i.i, label %for.cond16.preheader.loopexit179, label %for.body.i.i + +for.body7: ; preds = %_ZNSt6vectorI9ClassProbSaIS0_EE9push_backERKS0_.exit.for.body7_crit_edge, %for.body7.lr.ph + %elem_probs.sroa.15.0.elem_probs.sroa.15.16. = phi %struct.ClassProb* [ null, %for.body7.lr.ph ], [ %elem_probs.sroa.15.0.elem_probs.sroa.15.16..pre, %_ZNSt6vectorI9ClassProbSaIS0_EE9push_backERKS0_.exit.for.body7_crit_edge ] + %elem_probs.sroa.9.0.elem_probs.sroa.9.8..pre.i.i = phi i64 [ 0, %for.body7.lr.ph ], [ %elem_probs.sroa.9.0.elem_probs.sroa.9.8..pre, %_ZNSt6vectorI9ClassProbSaIS0_EE9push_backERKS0_.exit.for.body7_crit_edge ] + %indvars.iv = phi i64 [ 0, %for.body7.lr.ph ], [ %indvars.iv.next, %_ZNSt6vectorI9ClassProbSaIS0_EE9push_backERKS0_.exit.for.body7_crit_edge ] + %elem_probs.sroa.0.0157 = phi i64 [ 0, %for.body7.lr.ph ], [ %elem_probs.sroa.0.1, %_ZNSt6vectorI9ClassProbSaIS0_EE9push_backERKS0_.exit.for.body7_crit_edge ] + %add = add i64 %indvars.iv, %mul + %arrayidx9 = getelementptr inbounds float, float* %5, i64 %add + %40 = bitcast float* %arrayidx9 to i32* + %41 = load i32, i32* %40, align 4, !tbaa !20 + %42 = inttoptr i64 %elem_probs.sroa.9.0.elem_probs.sroa.9.8..pre.i.i to %struct.ClassProb* + %cmp.i111 = icmp eq %struct.ClassProb* %42, %elem_probs.sroa.15.0.elem_probs.sroa.15.16. + br i1 %cmp.i111, label %if.else.i112, label %if.then.i + +if.then.i: ; preds = %for.body7 + %43 = inttoptr i64 %elem_probs.sroa.9.0.elem_probs.sroa.9.8..pre.i.i to i64* + %cProb.sroa.5.0.insert.shift = shl nuw i64 %indvars.iv, 32 + %cProb.sroa.0.0.insert.ext = zext i32 %41 to i64 + %cProb.sroa.0.0.insert.insert = or i64 %cProb.sroa.0.0.insert.ext, %cProb.sroa.5.0.insert.shift + store i64 %cProb.sroa.0.0.insert.insert, i64* %43, align 4 + %elem_probs.sroa.9.0.elem_probs.sroa.9.8.145152 = load %struct.ClassProb*, %struct.ClassProb** %elem_probs.sroa.9.0._M_finish.i110.sroa_cast, align 8 + %incdec.ptr.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %elem_probs.sroa.9.0.elem_probs.sroa.9.8.145152, i64 1 + br label %_ZNSt6vectorI9ClassProbSaIS0_EE9push_backERKS0_.exit + +if.else.i112: ; preds = %for.body7 + %sub.ptr.sub.i21.i.i.i = sub i64 %elem_probs.sroa.9.0.elem_probs.sroa.9.8..pre.i.i, %elem_probs.sroa.0.0157 + %sub.ptr.div.i22.i.i.i = ashr exact i64 %sub.ptr.sub.i21.i.i.i, 3 + %cmp.i.i.i.i = icmp eq i64 %sub.ptr.div.i22.i.i.i, 0 + %.sroa.speculated.i.i.i = select i1 %cmp.i.i.i.i, i64 1, i64 %sub.ptr.div.i22.i.i.i + %add.i.i.i = add nsw i64 %.sroa.speculated.i.i.i, %sub.ptr.div.i22.i.i.i + %cmp7.i.i.i = icmp ult i64 %add.i.i.i, %sub.ptr.div.i22.i.i.i + %cmp9.i.i.i = icmp ugt i64 %add.i.i.i, 2305843009213693951 + %or.cond.i.i.i = or i1 %cmp7.i.i.i, %cmp9.i.i.i + %cond.i.i.i = select i1 %or.cond.i.i.i, i64 2305843009213693951, i64 %add.i.i.i + %cmp.i35.i.i = icmp eq i64 %cond.i.i.i, 0 + br i1 %cmp.i35.i.i, label %_ZNSt12_Vector_baseI9ClassProbSaIS0_EE11_M_allocateEm.exit.i.i, label %cond.true.i.i.i + +cond.true.i.i.i: ; preds = %if.else.i112 + %cmp.i.i.i.i.i = icmp ugt i64 %cond.i.i.i, 2305843009213693951 + br i1 %cmp.i.i.i.i.i, label %if.then.i.i.i.i.i113, label %_ZNSt16allocator_traitsISaI9ClassProbEE8allocateERS1_m.exit.i.i.i + +if.then.i.i.i.i.i113: ; preds = %cond.true.i.i.i + tail call void @_ZSt17__throw_bad_allocv() #13 + unreachable + +_ZNSt16allocator_traitsISaI9ClassProbEE8allocateERS1_m.exit.i.i.i: ; preds = %cond.true.i.i.i + %mul.i.i.i.i.i = shl i64 %cond.i.i.i, 3 + %call2.i.i.i.i.i = tail call i8* @_Znwm(i64 %mul.i.i.i.i.i) #2 + %44 = bitcast i8* %call2.i.i.i.i.i to %struct.ClassProb* + br label %_ZNSt12_Vector_baseI9ClassProbSaIS0_EE11_M_allocateEm.exit.i.i + +_ZNSt12_Vector_baseI9ClassProbSaIS0_EE11_M_allocateEm.exit.i.i: ; preds = %_ZNSt16allocator_traitsISaI9ClassProbEE8allocateERS1_m.exit.i.i.i, %if.else.i112 + %45 = phi i8* [ %call2.i.i.i.i.i, %_ZNSt16allocator_traitsISaI9ClassProbEE8allocateERS1_m.exit.i.i.i ], [ null, %if.else.i112 ] + %cond.i36.i.i = phi %struct.ClassProb* [ %44, %_ZNSt16allocator_traitsISaI9ClassProbEE8allocateERS1_m.exit.i.i.i ], [ null, %if.else.i112 ] + %add.ptr.i.i116 = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %cond.i36.i.i, i64 %sub.ptr.div.i22.i.i.i + %46 = bitcast %struct.ClassProb* %add.ptr.i.i116 to i64* + %cProb.sroa.5.0.insert.shift137 = shl nuw i64 %indvars.iv, 32 + %cProb.sroa.0.0.insert.ext131 = zext i32 %41 to i64 + %cProb.sroa.0.0.insert.insert133 = or i64 %cProb.sroa.0.0.insert.ext131, %cProb.sroa.5.0.insert.shift137 + store i64 %cProb.sroa.0.0.insert.insert133, i64* %46, align 4 + br i1 %cmp.i.i.i.i, label %_ZSt34__uninitialized_move_if_noexcept_aIP9ClassProbS1_SaIS0_EET0_T_S4_S3_RT1_.exit.i.i, label %if.then.i.i.i.i.i.i.i.i.i.i + +if.then.i.i.i.i.i.i.i.i.i.i: ; preds = %_ZNSt12_Vector_baseI9ClassProbSaIS0_EE11_M_allocateEm.exit.i.i + %47 = inttoptr i64 %elem_probs.sroa.0.0157 to i8* + tail call void @llvm.memmove.p0i8.p0i8.i64(i8* %45, i8* %47, i64 %sub.ptr.sub.i21.i.i.i, i32 4, i1 false) #2 + br label %_ZSt34__uninitialized_move_if_noexcept_aIP9ClassProbS1_SaIS0_EET0_T_S4_S3_RT1_.exit.i.i + +_ZSt34__uninitialized_move_if_noexcept_aIP9ClassProbS1_SaIS0_EET0_T_S4_S3_RT1_.exit.i.i: ; preds = %if.then.i.i.i.i.i.i.i.i.i.i, %_ZNSt12_Vector_baseI9ClassProbSaIS0_EE11_M_allocateEm.exit.i.i + %incdec.ptr.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %add.ptr.i.i116, i64 1 + %tobool.i.i.i = icmp eq i64 %elem_probs.sroa.0.0157, 0 + br i1 %tobool.i.i.i, label %_ZNSt6vectorI9ClassProbSaIS0_EE19_M_emplace_back_auxIJRKS0_EEEvDpOT_.exit.i, label %if.then.i37.i.i + +if.then.i37.i.i: ; preds = %_ZSt34__uninitialized_move_if_noexcept_aIP9ClassProbS1_SaIS0_EET0_T_S4_S3_RT1_.exit.i.i + %48 = inttoptr i64 %elem_probs.sroa.0.0157 to i8* + tail call void @_ZdlPv(i8* %48) #2 + br label %_ZNSt6vectorI9ClassProbSaIS0_EE19_M_emplace_back_auxIJRKS0_EEEvDpOT_.exit.i + +_ZNSt6vectorI9ClassProbSaIS0_EE19_M_emplace_back_auxIJRKS0_EEEvDpOT_.exit.i: ; preds = %if.then.i37.i.i, %_ZSt34__uninitialized_move_if_noexcept_aIP9ClassProbS1_SaIS0_EET0_T_S4_S3_RT1_.exit.i.i + %49 = ptrtoint i8* %45 to i64 + %50 = ptrtoint %struct.ClassProb* %incdec.ptr.i.i to i64 + store i64 %50, i64* %elem_probs.sroa.9, align 8 + %add.ptr23.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %cond.i36.i.i, i64 %cond.i.i.i + br label %_ZNSt6vectorI9ClassProbSaIS0_EE9push_backERKS0_.exit + +_ZNSt6vectorI9ClassProbSaIS0_EE9push_backERKS0_.exit: ; preds = %if.then.i, %_ZNSt6vectorI9ClassProbSaIS0_EE19_M_emplace_back_auxIJRKS0_EEEvDpOT_.exit.i + %elem_probs.sroa.0.1 = phi i64 [ %49, %_ZNSt6vectorI9ClassProbSaIS0_EE19_M_emplace_back_auxIJRKS0_EEEvDpOT_.exit.i ], [ %elem_probs.sroa.0.0157, %if.then.i ] + %_M_end_of_storage.sink.i = phi %struct.ClassProb** [ %elem_probs.sroa.15, %_ZNSt6vectorI9ClassProbSaIS0_EE19_M_emplace_back_auxIJRKS0_EEEvDpOT_.exit.i ], [ %elem_probs.sroa.9.0._M_finish.i110.sroa_cast, %if.then.i ] + %add.ptr23.i.sink.i = phi %struct.ClassProb* [ %add.ptr23.i.i, %_ZNSt6vectorI9ClassProbSaIS0_EE19_M_emplace_back_auxIJRKS0_EEEvDpOT_.exit.i ], [ %incdec.ptr.i, %if.then.i ] + store %struct.ClassProb* %add.ptr23.i.sink.i, %struct.ClassProb** %_M_end_of_storage.sink.i, align 8, !tbaa !73 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %cmp5 = icmp ult i64 %indvars.iv.next, %6 + %elem_probs.sroa.9.0.elem_probs.sroa.9.8..pre = load i64, i64* %elem_probs.sroa.9, align 8 + br i1 %cmp5, label %_ZNSt6vectorI9ClassProbSaIS0_EE9push_backERKS0_.exit.for.body7_crit_edge, label %for.cond.cleanup6.loopexit + +_ZNSt6vectorI9ClassProbSaIS0_EE9push_backERKS0_.exit.for.body7_crit_edge: ; preds = %_ZNSt6vectorI9ClassProbSaIS0_EE9push_backERKS0_.exit + %elem_probs.sroa.15.0.elem_probs.sroa.15.16..pre = load %struct.ClassProb*, %struct.ClassProb** %elem_probs.sroa.15, align 8 + br label %for.body7 + +if.then.i.i.i: ; preds = %for.cond16.preheader + %51 = inttoptr i64 %elem_probs.sroa.0.0.lcssa to i8* + tail call void @_ZdlPv(i8* %51) #2 + br label %_ZNSt6vectorI9ClassProbSaIS0_EED2Ev.exit + +_ZNSt6vectorI9ClassProbSaIS0_EED2Ev.exit: ; preds = %for.cond16.preheader, %if.then.i.i.i + call void @llvm.lifetime.end(i64 8, i8* nonnull %elem_probs.sroa.9.0..sroa_cast151) + call void @llvm.lifetime.end(i64 8, i8* nonnull %elem_probs.sroa.15.0..sroa_cast149) + %indvars.iv.next170 = add nuw nsw i64 %indvars.iv169, 1 + %cmp = icmp slt i64 %indvars.iv.next170, %7 + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit + +if.then47: ; preds = %for.cond.cleanup + %52 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i8* + call void @llvm.lifetime.start(i64 376, i8* nonnull %52) #2 + %53 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2 + %54 = getelementptr inbounds %"class.std::basic_ios", %"class.std::basic_ios"* %53, i64 0, i32 0 + call void @_ZNSt8ios_baseC2Ev(%"class.std::ios_base"* %54) #2 + %55 = getelementptr inbounds %"class.std::basic_ios", %"class.std::basic_ios"* %53, i64 0, i32 0, i32 0 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [4 x i8*] }, { [4 x i8*] }* @_ZTVSt9basic_iosIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %55, align 16, !tbaa !46 + %_M_tie.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 1 + store %"class.std::basic_ostream"* null, %"class.std::basic_ostream"** %_M_tie.i.i, align 8, !tbaa !48 + %_M_fill.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 2 + store i8 0, i8* %_M_fill.i.i, align 16, !tbaa !51 + %_M_fill_init.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 3 + store i8 0, i8* %_M_fill_init.i.i, align 1, !tbaa !52 + %_M_streambuf.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 4 + %56 = bitcast %"class.std::basic_streambuf"** %_M_streambuf.i.i to i8* + call void @llvm.memset.p0i8.i64(i8* %56, i8 0, i64 32, i32 8, i1 false) #2 + %57 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 1) to i64*), align 8 + %58 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i64* + store i64 %57, i64* %58, align 16, !tbaa !46 + %59 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 2) to i64*), align 8 + %60 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i8** + %vtable.cast.i.i118 = inttoptr i64 %57 to i8* + %vbase.offset.ptr.i.i119 = getelementptr i8, i8* %vtable.cast.i.i118, i64 -24 + %61 = bitcast i8* %vbase.offset.ptr.i.i119 to i64* + %vbase.offset.i.i120 = load i64, i64* %61, align 8 + %add.ptr.i.i121 = getelementptr inbounds i8, i8* %52, i64 %vbase.offset.i.i120 + %62 = bitcast i8* %add.ptr.i.i121 to i64* + store i64 %59, i64* %62, align 8, !tbaa !46 + %vtable3.i.i = load i8*, i8** %60, align 16, !tbaa !46 + %vbase.offset.ptr4.i.i = getelementptr i8, i8* %vtable3.i.i, i64 -24 + %63 = bitcast i8* %vbase.offset.ptr4.i.i to i64* + %vbase.offset5.i.i = load i64, i64* %63, align 8 + %add.ptr6.i.i = getelementptr inbounds i8, i8* %52, i64 %vbase.offset5.i.i + %64 = bitcast i8* %add.ptr6.i.i to %"class.std::basic_ios"* + call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %64, %"class.std::basic_streambuf"* null) #2 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 1, i64 3) to i32 (...)**), i32 (...)*** %55, align 16, !tbaa !46 + %_M_stringbuf.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1 + %65 = getelementptr inbounds %"class.std::__cxx11::basic_stringbuf", %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i, i64 0, i32 0, i32 0 + %66 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to <2 x i32 (...)**>* + store <2 x i32 (...)**> <i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 3) to i32 (...)**), i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**)>, <2 x i32 (...)**>* %66, align 16, !tbaa !46 + %_M_in_beg.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 1 + %_M_buf_locale.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 7 + %67 = bitcast i8** %_M_in_beg.i.i.i to i8* + call void @llvm.memset.p0i8.i64(i8* %67, i8 0, i64 48, i32 8, i1 false) #2 + call void @_ZNSt6localeC1Ev(%"class.std::locale"* %_M_buf_locale.i.i.i) #2 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %65, align 8, !tbaa !46 + %_M_mode.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 1 + store i32 16, i32* %_M_mode.i.i, align 8, !tbaa !53 + %_M_string.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2 + %68 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 2 + %69 = bitcast %"class.std::__cxx11::basic_string"* %_M_string.i.i to %union.anon** + store %union.anon* %68, %union.anon** %69, align 8, !tbaa !58 + %_M_string_length.i.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 1 + store i64 0, i64* %_M_string_length.i.i.i.i.i, align 8, !tbaa !59 + %.cast.i.i.i = bitcast %union.anon* %68 to i8* + store i8 0, i8* %.cast.i.i.i, align 8, !tbaa !42 + %vtable.i = load i8*, i8** %60, align 16, !tbaa !46 + %vbase.offset.ptr.i = getelementptr i8, i8* %vtable.i, i64 -24 + %70 = bitcast i8* %vbase.offset.ptr.i to i64* + %vbase.offset.i = load i64, i64* %70, align 8 + %add.ptr2.i = getelementptr inbounds i8, i8* %52, i64 %vbase.offset.i + %71 = bitcast i8* %add.ptr2.i to %"class.std::basic_ios"* + %72 = getelementptr inbounds %"class.std::__cxx11::basic_stringbuf", %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i, i64 0, i32 0 + call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %71, %"class.std::basic_streambuf"* %72) #2 + %73 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to %"class.std::basic_ostream"* + %vtable.i122 = load i8*, i8** %60, align 16, !tbaa !46 + %vbase.offset.ptr.i123 = getelementptr i8, i8* %vtable.i122, i64 -24 + %74 = bitcast i8* %vbase.offset.ptr.i123 to i64* + %vbase.offset.i124 = load i64, i64* %74, align 8 + %add.ptr.i125 = getelementptr inbounds i8, i8* %52, i64 %vbase.offset.i124 + %_M_flags.i.i = getelementptr inbounds i8, i8* %add.ptr.i125, i64 24 + %75 = bitcast i8* %_M_flags.i.i to i32* + %76 = load i32, i32* %75, align 8, !tbaa !65 + %and.i.i.i.i = and i32 %76, -261 + %or.i.i.i.i = or i32 %and.i.i.i.i, 4 + store i32 %or.i.i.i.i, i32* %75, align 4, !tbaa !60 + %call.i = call dereferenceable(272) %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"* nonnull %73, double %conv43) #2 + %77 = bitcast %"class.std::__cxx11::basic_string"* %print_str to i8* + call void @llvm.lifetime.start(i64 32, i8* nonnull %77) #2 + call void @_ZNKSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEE3strEv(%"class.std::__cxx11::basic_string"* nonnull sret %print_str, %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i) #2 + %_M_p.i.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 0, i32 0 + %78 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !62 + %_M_string_length.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 1 + %79 = load i64, i64* %_M_string_length.i, align 8, !tbaa !59 + %call52 = call i64 @fwrite(i8* %78, i64 1, i64 %79, %struct._IO_FILE* nonnull %call45) + %80 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !62 + %81 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 2 + %arraydecay.i.i.i.i = bitcast %union.anon* %81 to i8* + %cmp.i.i.i127 = icmp eq i8* %80, %arraydecay.i.i.i.i + br i1 %cmp.i.i.i127, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit, label %if.then.i.i128 + +if.then.i.i128: ; preds = %if.then47 + call void @_ZdlPv(i8* %80) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit: ; preds = %if.then47, %if.then.i.i128 + call void @llvm.lifetime.end(i64 32, i8* nonnull %77) #2 + %82 = load i64, i64* bitcast ([4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE to i64*), align 8 + store i64 %82, i64* %58, align 16, !tbaa !46 + %83 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 3) to i64*), align 8 + %vtable.cast.i.i = inttoptr i64 %82 to i8* + %vbase.offset.ptr.i.i = getelementptr i8, i8* %vtable.cast.i.i, i64 -24 + %84 = bitcast i8* %vbase.offset.ptr.i.i to i64* + %vbase.offset.i.i = load i64, i64* %84, align 8 + %add.ptr.i.i = getelementptr inbounds i8, i8* %52, i64 %vbase.offset.i.i + %85 = bitcast i8* %add.ptr.i.i to i64* + store i64 %83, i64* %85, align 8, !tbaa !46 + %86 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 0 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %86, align 8, !tbaa !46 + %_M_p.i.i.i.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 0, i32 0 + %87 = load i8*, i8** %_M_p.i.i.i.i.i.i.i, align 8, !tbaa !62 + %cmp.i.i.i.i.i.i = icmp eq i8* %87, %.cast.i.i.i + br i1 %cmp.i.i.i.i.i.i, label %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit, label %if.then.i.i.i.i.i + +if.then.i.i.i.i.i: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit + call void @_ZdlPv(i8* %87) #2 + br label %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit + +_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit, %if.then.i.i.i.i.i + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %86, align 8, !tbaa !46 + call void @_ZNSt6localeD1Ev(%"class.std::locale"* nonnull %_M_buf_locale.i.i.i) #2 + %88 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 0 + call void @_ZNSt8ios_baseD2Ev(%"class.std::ios_base"* %88) #2 + call void @llvm.lifetime.end(i64 376, i8* nonnull %52) #2 + br label %if.end53 + +if.end53: ; preds = %for.cond.cleanup, %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit + %call54 = call i32 @fclose(%struct._IO_FILE* %call45) + ret float %conv42 +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #4 + +; Function Attrs: nounwind uwtable +define void @_Z17dumpFinalAccuracyf(float %accuracy) local_unnamed_addr #3 { +entry: + %ss = alloca %"class.std::__cxx11::basic_ostringstream", align 16 + %print_str = alloca %"class.std::__cxx11::basic_string", align 8 + %conv = fpext float %accuracy to double + %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([30 x i8], [30 x i8]* @.str.26, i64 0, i64 0), double %conv) + %call1 = tail call %struct._IO_FILE* @fopen(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.22, i64 0, i64 0), i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.23, i64 0, i64 0)) + %cmp = icmp eq %struct._IO_FILE* %call1, null + br i1 %cmp, label %if.end, label %if.then + +if.then: ; preds = %entry + %0 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i8* + call void @llvm.lifetime.start(i64 376, i8* nonnull %0) #2 + %1 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2 + %2 = getelementptr inbounds %"class.std::basic_ios", %"class.std::basic_ios"* %1, i64 0, i32 0 + call void @_ZNSt8ios_baseC2Ev(%"class.std::ios_base"* %2) #2 + %3 = getelementptr inbounds %"class.std::basic_ios", %"class.std::basic_ios"* %1, i64 0, i32 0, i32 0 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [4 x i8*] }, { [4 x i8*] }* @_ZTVSt9basic_iosIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %3, align 16, !tbaa !46 + %_M_tie.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 1 + store %"class.std::basic_ostream"* null, %"class.std::basic_ostream"** %_M_tie.i.i, align 8, !tbaa !48 + %_M_fill.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 2 + store i8 0, i8* %_M_fill.i.i, align 16, !tbaa !51 + %_M_fill_init.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 3 + store i8 0, i8* %_M_fill_init.i.i, align 1, !tbaa !52 + %_M_streambuf.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 4 + %4 = bitcast %"class.std::basic_streambuf"** %_M_streambuf.i.i to i8* + call void @llvm.memset.p0i8.i64(i8* %4, i8 0, i64 32, i32 8, i1 false) #2 + %5 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 1) to i64*), align 8 + %6 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i64* + store i64 %5, i64* %6, align 16, !tbaa !46 + %7 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 2) to i64*), align 8 + %8 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i8** + %vtable.cast.i.i = inttoptr i64 %5 to i8* + %vbase.offset.ptr.i.i = getelementptr i8, i8* %vtable.cast.i.i, i64 -24 + %9 = bitcast i8* %vbase.offset.ptr.i.i to i64* + %vbase.offset.i.i = load i64, i64* %9, align 8 + %add.ptr.i.i = getelementptr inbounds i8, i8* %0, i64 %vbase.offset.i.i + %10 = bitcast i8* %add.ptr.i.i to i64* + store i64 %7, i64* %10, align 8, !tbaa !46 + %vtable3.i.i = load i8*, i8** %8, align 16, !tbaa !46 + %vbase.offset.ptr4.i.i = getelementptr i8, i8* %vtable3.i.i, i64 -24 + %11 = bitcast i8* %vbase.offset.ptr4.i.i to i64* + %vbase.offset5.i.i = load i64, i64* %11, align 8 + %add.ptr6.i.i = getelementptr inbounds i8, i8* %0, i64 %vbase.offset5.i.i + %12 = bitcast i8* %add.ptr6.i.i to %"class.std::basic_ios"* + call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %12, %"class.std::basic_streambuf"* null) #2 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 1, i64 3) to i32 (...)**), i32 (...)*** %3, align 16, !tbaa !46 + %_M_stringbuf.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1 + %13 = getelementptr inbounds %"class.std::__cxx11::basic_stringbuf", %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i, i64 0, i32 0, i32 0 + %14 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to <2 x i32 (...)**>* + store <2 x i32 (...)**> <i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 3) to i32 (...)**), i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**)>, <2 x i32 (...)**>* %14, align 16, !tbaa !46 + %_M_in_beg.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 1 + %_M_buf_locale.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 7 + %15 = bitcast i8** %_M_in_beg.i.i.i to i8* + call void @llvm.memset.p0i8.i64(i8* %15, i8 0, i64 48, i32 8, i1 false) #2 + call void @_ZNSt6localeC1Ev(%"class.std::locale"* %_M_buf_locale.i.i.i) #2 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %13, align 8, !tbaa !46 + %_M_mode.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 1 + store i32 16, i32* %_M_mode.i.i, align 8, !tbaa !53 + %_M_string.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2 + %16 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 2 + %17 = bitcast %"class.std::__cxx11::basic_string"* %_M_string.i.i to %union.anon** + store %union.anon* %16, %union.anon** %17, align 8, !tbaa !58 + %_M_string_length.i.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 1 + store i64 0, i64* %_M_string_length.i.i.i.i.i, align 8, !tbaa !59 + %.cast.i.i.i = bitcast %union.anon* %16 to i8* + store i8 0, i8* %.cast.i.i.i, align 8, !tbaa !42 + %vtable.i = load i8*, i8** %8, align 16, !tbaa !46 + %vbase.offset.ptr.i = getelementptr i8, i8* %vtable.i, i64 -24 + %18 = bitcast i8* %vbase.offset.ptr.i to i64* + %vbase.offset.i = load i64, i64* %18, align 8 + %add.ptr2.i = getelementptr inbounds i8, i8* %0, i64 %vbase.offset.i + %19 = bitcast i8* %add.ptr2.i to %"class.std::basic_ios"* + %20 = getelementptr inbounds %"class.std::__cxx11::basic_stringbuf", %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i, i64 0, i32 0 + call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %19, %"class.std::basic_streambuf"* %20) #2 + %21 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to %"class.std::basic_ostream"* + %vtable.i11 = load i8*, i8** %8, align 16, !tbaa !46 + %vbase.offset.ptr.i12 = getelementptr i8, i8* %vtable.i11, i64 -24 + %22 = bitcast i8* %vbase.offset.ptr.i12 to i64* + %vbase.offset.i13 = load i64, i64* %22, align 8 + %add.ptr.i = getelementptr inbounds i8, i8* %0, i64 %vbase.offset.i13 + %_M_flags.i.i = getelementptr inbounds i8, i8* %add.ptr.i, i64 24 + %23 = bitcast i8* %_M_flags.i.i to i32* + %24 = load i32, i32* %23, align 8, !tbaa !65 + %and.i.i.i.i = and i32 %24, -261 + %or.i.i.i.i = or i32 %and.i.i.i.i, 4 + store i32 %or.i.i.i.i, i32* %23, align 4, !tbaa !60 + %call.i = call dereferenceable(272) %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"* nonnull %21, double %conv) #2 + %25 = bitcast %"class.std::__cxx11::basic_string"* %print_str to i8* + call void @llvm.lifetime.start(i64 32, i8* nonnull %25) #2 + call void @_ZNKSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEE3strEv(%"class.std::__cxx11::basic_string"* nonnull sret %print_str, %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i) #2 + %_M_p.i.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 0, i32 0 + %26 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !62 + %_M_string_length.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 1 + %27 = load i64, i64* %_M_string_length.i, align 8, !tbaa !59 + %call6 = call i64 @fwrite(i8* %26, i64 1, i64 %27, %struct._IO_FILE* nonnull %call1) + %28 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !62 + %29 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 2 + %arraydecay.i.i.i.i = bitcast %union.anon* %29 to i8* + %cmp.i.i.i = icmp eq i8* %28, %arraydecay.i.i.i.i + br i1 %cmp.i.i.i, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit, label %if.then.i.i + +if.then.i.i: ; preds = %if.then + call void @_ZdlPv(i8* %28) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit: ; preds = %if.then, %if.then.i.i + call void @llvm.lifetime.end(i64 32, i8* nonnull %25) #2 + %30 = load i64, i64* bitcast ([4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE to i64*), align 8 + store i64 %30, i64* %6, align 16, !tbaa !46 + %31 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 3) to i64*), align 8 + %vtable.cast.i.i15 = inttoptr i64 %30 to i8* + %vbase.offset.ptr.i.i16 = getelementptr i8, i8* %vtable.cast.i.i15, i64 -24 + %32 = bitcast i8* %vbase.offset.ptr.i.i16 to i64* + %vbase.offset.i.i17 = load i64, i64* %32, align 8 + %add.ptr.i.i18 = getelementptr inbounds i8, i8* %0, i64 %vbase.offset.i.i17 + %33 = bitcast i8* %add.ptr.i.i18 to i64* + store i64 %31, i64* %33, align 8, !tbaa !46 + %34 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 0 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %34, align 8, !tbaa !46 + %_M_p.i.i.i.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 0, i32 0 + %35 = load i8*, i8** %_M_p.i.i.i.i.i.i.i, align 8, !tbaa !62 + %cmp.i.i.i.i.i.i = icmp eq i8* %35, %.cast.i.i.i + br i1 %cmp.i.i.i.i.i.i, label %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit, label %if.then.i.i.i.i.i + +if.then.i.i.i.i.i: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit + call void @_ZdlPv(i8* %35) #2 + br label %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit + +_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit, %if.then.i.i.i.i.i + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %34, align 8, !tbaa !46 + call void @_ZNSt6localeD1Ev(%"class.std::locale"* nonnull %_M_buf_locale.i.i.i) #2 + %36 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 0 + call void @_ZNSt8ios_baseD2Ev(%"class.std::ios_base"* %36) #2 + call void @llvm.lifetime.end(i64 376, i8* nonnull %0) #2 + br label %if.end + +if.end: ; preds = %entry, %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit + %call7 = call i32 @fclose(%struct._IO_FILE* %call1) + %37 = load float*, float** getelementptr inbounds (%"class.std::vector", %"class.std::vector"* @run_accuracies, i64 0, i32 0, i32 0, i32 1), align 8, !tbaa !74 + %38 = load float*, float** getelementptr inbounds (%"class.std::vector", %"class.std::vector"* @run_accuracies, i64 0, i32 0, i32 0, i32 2), align 8, !tbaa !75 + %cmp.i = icmp eq float* %37, %38 + %39 = ptrtoint float* %37 to i64 + br i1 %cmp.i, label %if.else.i, label %if.then.i + +if.then.i: ; preds = %if.end + store float %accuracy, float* %37, align 4, !tbaa !20 + %incdec.ptr.i = getelementptr inbounds float, float* %37, i64 1 + br label %_ZNSt6vectorIfSaIfEE9push_backERKf.exit + +if.else.i: ; preds = %if.end + %40 = load i64, i64* bitcast (%"class.std::vector"* @run_accuracies to i64*), align 8, !tbaa !1 + %sub.ptr.sub.i21.i.i.i = sub i64 %39, %40 + %sub.ptr.div.i22.i.i.i = ashr exact i64 %sub.ptr.sub.i21.i.i.i, 2 + %cmp.i.i.i.i = icmp eq i64 %sub.ptr.div.i22.i.i.i, 0 + %.sroa.speculated.i.i.i = select i1 %cmp.i.i.i.i, i64 1, i64 %sub.ptr.div.i22.i.i.i + %add.i.i.i = add nsw i64 %.sroa.speculated.i.i.i, %sub.ptr.div.i22.i.i.i + %cmp7.i.i.i = icmp ult i64 %add.i.i.i, %sub.ptr.div.i22.i.i.i + %cmp9.i.i.i = icmp ugt i64 %add.i.i.i, 4611686018427387903 + %or.cond.i.i.i = or i1 %cmp7.i.i.i, %cmp9.i.i.i + %cond.i.i.i = select i1 %or.cond.i.i.i, i64 4611686018427387903, i64 %add.i.i.i + %cmp.i35.i.i = icmp eq i64 %cond.i.i.i, 0 + br i1 %cmp.i35.i.i, label %_ZNSt12_Vector_baseIfSaIfEE11_M_allocateEm.exit.i.i, label %cond.true.i.i.i + +cond.true.i.i.i: ; preds = %if.else.i + %cmp.i.i.i.i.i = icmp ugt i64 %cond.i.i.i, 4611686018427387903 + br i1 %cmp.i.i.i.i.i, label %if.then.i.i.i.i.i19, label %_ZNSt16allocator_traitsISaIfEE8allocateERS0_m.exit.i.i.i + +if.then.i.i.i.i.i19: ; preds = %cond.true.i.i.i + call void @_ZSt17__throw_bad_allocv() #13 + unreachable + +_ZNSt16allocator_traitsISaIfEE8allocateERS0_m.exit.i.i.i: ; preds = %cond.true.i.i.i + %mul.i.i.i.i.i = shl i64 %cond.i.i.i, 2 + %call2.i.i.i.i.i = call i8* @_Znwm(i64 %mul.i.i.i.i.i) #2 + %41 = bitcast i8* %call2.i.i.i.i.i to float* + %.pre.i.i = load i64, i64* bitcast (float** getelementptr inbounds (%"class.std::vector", %"class.std::vector"* @run_accuracies, i64 0, i32 0, i32 0, i32 1) to i64*), align 8, !tbaa !74 + %.pre38.i.i = load i64, i64* bitcast (%"class.std::vector"* @run_accuracies to i64*), align 8, !tbaa !1 + br label %_ZNSt12_Vector_baseIfSaIfEE11_M_allocateEm.exit.i.i + +_ZNSt12_Vector_baseIfSaIfEE11_M_allocateEm.exit.i.i: ; preds = %_ZNSt16allocator_traitsISaIfEE8allocateERS0_m.exit.i.i.i, %if.else.i + %.in.i.i = phi i64 [ %.pre38.i.i, %_ZNSt16allocator_traitsISaIfEE8allocateERS0_m.exit.i.i.i ], [ %40, %if.else.i ] + %42 = phi i64 [ %.pre.i.i, %_ZNSt16allocator_traitsISaIfEE8allocateERS0_m.exit.i.i.i ], [ %39, %if.else.i ] + %43 = phi i8* [ %call2.i.i.i.i.i, %_ZNSt16allocator_traitsISaIfEE8allocateERS0_m.exit.i.i.i ], [ null, %if.else.i ] + %cond.i36.i.i = phi float* [ %41, %_ZNSt16allocator_traitsISaIfEE8allocateERS0_m.exit.i.i.i ], [ null, %if.else.i ] + %sub.ptr.sub.i.i.i = sub i64 %42, %.in.i.i + %sub.ptr.div.i.i.i = ashr exact i64 %sub.ptr.sub.i.i.i, 2 + %add.ptr.i.i20 = getelementptr inbounds float, float* %cond.i36.i.i, i64 %sub.ptr.div.i.i.i + store float %accuracy, float* %add.ptr.i.i20, align 4, !tbaa !20 + %tobool.i.i.i.i.i.i.i.i.i.i = icmp eq i64 %sub.ptr.div.i.i.i, 0 + br i1 %tobool.i.i.i.i.i.i.i.i.i.i, label %_ZSt34__uninitialized_move_if_noexcept_aIPfS0_SaIfEET0_T_S3_S2_RT1_.exit.i.i, label %if.then.i.i.i.i.i.i.i.i.i.i + +if.then.i.i.i.i.i.i.i.i.i.i: ; preds = %_ZNSt12_Vector_baseIfSaIfEE11_M_allocateEm.exit.i.i + %44 = inttoptr i64 %.in.i.i to i8* + call void @llvm.memmove.p0i8.p0i8.i64(i8* %43, i8* %44, i64 %sub.ptr.sub.i.i.i, i32 4, i1 false) #2 + br label %_ZSt34__uninitialized_move_if_noexcept_aIPfS0_SaIfEET0_T_S3_S2_RT1_.exit.i.i + +_ZSt34__uninitialized_move_if_noexcept_aIPfS0_SaIfEET0_T_S3_S2_RT1_.exit.i.i: ; preds = %if.then.i.i.i.i.i.i.i.i.i.i, %_ZNSt12_Vector_baseIfSaIfEE11_M_allocateEm.exit.i.i + %incdec.ptr.i.i = getelementptr inbounds float, float* %add.ptr.i.i20, i64 1 + %tobool.i.i.i = icmp eq i64 %.in.i.i, 0 + br i1 %tobool.i.i.i, label %_ZNSt6vectorIfSaIfEE19_M_emplace_back_auxIJRKfEEEvDpOT_.exit.i, label %if.then.i37.i.i + +if.then.i37.i.i: ; preds = %_ZSt34__uninitialized_move_if_noexcept_aIPfS0_SaIfEET0_T_S3_S2_RT1_.exit.i.i + %45 = inttoptr i64 %.in.i.i to i8* + call void @_ZdlPv(i8* %45) #2 + br label %_ZNSt6vectorIfSaIfEE19_M_emplace_back_auxIJRKfEEEvDpOT_.exit.i + +_ZNSt6vectorIfSaIfEE19_M_emplace_back_auxIJRKfEEEvDpOT_.exit.i: ; preds = %if.then.i37.i.i, %_ZSt34__uninitialized_move_if_noexcept_aIPfS0_SaIfEET0_T_S3_S2_RT1_.exit.i.i + store i8* %43, i8** bitcast (%"class.std::vector"* @run_accuracies to i8**), align 8, !tbaa !1 + store float* %incdec.ptr.i.i, float** getelementptr inbounds (%"class.std::vector", %"class.std::vector"* @run_accuracies, i64 0, i32 0, i32 0, i32 1), align 8, !tbaa !74 + %add.ptr23.i.i = getelementptr inbounds float, float* %cond.i36.i.i, i64 %cond.i.i.i + br label %_ZNSt6vectorIfSaIfEE9push_backERKf.exit + +_ZNSt6vectorIfSaIfEE9push_backERKf.exit: ; preds = %if.then.i, %_ZNSt6vectorIfSaIfEE19_M_emplace_back_auxIJRKfEEEvDpOT_.exit.i + %_M_end_of_storage.sink.i = phi float** [ getelementptr inbounds (%"class.std::vector", %"class.std::vector"* @run_accuracies, i64 0, i32 0, i32 0, i32 2), %_ZNSt6vectorIfSaIfEE19_M_emplace_back_auxIJRKfEEEvDpOT_.exit.i ], [ getelementptr inbounds (%"class.std::vector", %"class.std::vector"* @run_accuracies, i64 0, i32 0, i32 0, i32 1), %if.then.i ] + %add.ptr23.i.sink.i = phi float* [ %add.ptr23.i.i, %_ZNSt6vectorIfSaIfEE19_M_emplace_back_auxIJRKfEEEvDpOT_.exit.i ], [ %incdec.ptr.i, %if.then.i ] + store float* %add.ptr23.i.sink.i, float** %_M_end_of_storage.sink.i, align 8, !tbaa !73 ret void } ; Function Attrs: nounwind uwtable -define void @_Z10var_0_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #0 { +define void @_Z11dumpAvgPSNRf(float %avg_psnr) local_unnamed_addr #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #7 - %call = tail call i8* @__visc__tensor_convolution(i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + %ss = alloca %"class.std::__cxx11::basic_ostringstream", align 16 + %print_str = alloca %"class.std::__cxx11::basic_string", align 8 + %call = tail call %struct._IO_FILE* @fopen(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str.27, i64 0, i64 0), i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.23, i64 0, i64 0)) + %cmp = icmp eq %struct._IO_FILE* %call, null + br i1 %cmp, label %if.end, label %if.then + +if.then: ; preds = %entry + %0 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i8* + call void @llvm.lifetime.start(i64 376, i8* nonnull %0) #2 + %1 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2 + %2 = getelementptr inbounds %"class.std::basic_ios", %"class.std::basic_ios"* %1, i64 0, i32 0 + call void @_ZNSt8ios_baseC2Ev(%"class.std::ios_base"* %2) #2 + %3 = getelementptr inbounds %"class.std::basic_ios", %"class.std::basic_ios"* %1, i64 0, i32 0, i32 0 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [4 x i8*] }, { [4 x i8*] }* @_ZTVSt9basic_iosIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %3, align 16, !tbaa !46 + %_M_tie.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 1 + store %"class.std::basic_ostream"* null, %"class.std::basic_ostream"** %_M_tie.i.i, align 8, !tbaa !48 + %_M_fill.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 2 + store i8 0, i8* %_M_fill.i.i, align 16, !tbaa !51 + %_M_fill_init.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 3 + store i8 0, i8* %_M_fill_init.i.i, align 1, !tbaa !52 + %_M_streambuf.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 4 + %4 = bitcast %"class.std::basic_streambuf"** %_M_streambuf.i.i to i8* + call void @llvm.memset.p0i8.i64(i8* %4, i8 0, i64 32, i32 8, i1 false) #2 + %5 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 1) to i64*), align 8 + %6 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i64* + store i64 %5, i64* %6, align 16, !tbaa !46 + %7 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 2) to i64*), align 8 + %8 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i8** + %vtable.cast.i.i = inttoptr i64 %5 to i8* + %vbase.offset.ptr.i.i = getelementptr i8, i8* %vtable.cast.i.i, i64 -24 + %9 = bitcast i8* %vbase.offset.ptr.i.i to i64* + %vbase.offset.i.i = load i64, i64* %9, align 8 + %add.ptr.i.i = getelementptr inbounds i8, i8* %0, i64 %vbase.offset.i.i + %10 = bitcast i8* %add.ptr.i.i to i64* + store i64 %7, i64* %10, align 8, !tbaa !46 + %vtable3.i.i = load i8*, i8** %8, align 16, !tbaa !46 + %vbase.offset.ptr4.i.i = getelementptr i8, i8* %vtable3.i.i, i64 -24 + %11 = bitcast i8* %vbase.offset.ptr4.i.i to i64* + %vbase.offset5.i.i = load i64, i64* %11, align 8 + %add.ptr6.i.i = getelementptr inbounds i8, i8* %0, i64 %vbase.offset5.i.i + %12 = bitcast i8* %add.ptr6.i.i to %"class.std::basic_ios"* + call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %12, %"class.std::basic_streambuf"* null) #2 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 1, i64 3) to i32 (...)**), i32 (...)*** %3, align 16, !tbaa !46 + %_M_stringbuf.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1 + %13 = getelementptr inbounds %"class.std::__cxx11::basic_stringbuf", %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i, i64 0, i32 0, i32 0 + %14 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to <2 x i32 (...)**>* + store <2 x i32 (...)**> <i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 3) to i32 (...)**), i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**)>, <2 x i32 (...)**>* %14, align 16, !tbaa !46 + %_M_in_beg.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 1 + %_M_buf_locale.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 7 + %15 = bitcast i8** %_M_in_beg.i.i.i to i8* + call void @llvm.memset.p0i8.i64(i8* %15, i8 0, i64 48, i32 8, i1 false) #2 + call void @_ZNSt6localeC1Ev(%"class.std::locale"* %_M_buf_locale.i.i.i) #2 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %13, align 8, !tbaa !46 + %_M_mode.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 1 + store i32 16, i32* %_M_mode.i.i, align 8, !tbaa !53 + %_M_string.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2 + %16 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 2 + %17 = bitcast %"class.std::__cxx11::basic_string"* %_M_string.i.i to %union.anon** + store %union.anon* %16, %union.anon** %17, align 8, !tbaa !58 + %_M_string_length.i.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 1 + store i64 0, i64* %_M_string_length.i.i.i.i.i, align 8, !tbaa !59 + %.cast.i.i.i = bitcast %union.anon* %16 to i8* + store i8 0, i8* %.cast.i.i.i, align 8, !tbaa !42 + %vtable.i = load i8*, i8** %8, align 16, !tbaa !46 + %vbase.offset.ptr.i = getelementptr i8, i8* %vtable.i, i64 -24 + %18 = bitcast i8* %vbase.offset.ptr.i to i64* + %vbase.offset.i = load i64, i64* %18, align 8 + %add.ptr2.i = getelementptr inbounds i8, i8* %0, i64 %vbase.offset.i + %19 = bitcast i8* %add.ptr2.i to %"class.std::basic_ios"* + %20 = getelementptr inbounds %"class.std::__cxx11::basic_stringbuf", %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i, i64 0, i32 0 + call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %19, %"class.std::basic_streambuf"* %20) #2 + %21 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to %"class.std::basic_ostream"* + %vtable.i10 = load i8*, i8** %8, align 16, !tbaa !46 + %vbase.offset.ptr.i11 = getelementptr i8, i8* %vtable.i10, i64 -24 + %22 = bitcast i8* %vbase.offset.ptr.i11 to i64* + %vbase.offset.i12 = load i64, i64* %22, align 8 + %add.ptr.i = getelementptr inbounds i8, i8* %0, i64 %vbase.offset.i12 + %_M_flags.i.i = getelementptr inbounds i8, i8* %add.ptr.i, i64 24 + %23 = bitcast i8* %_M_flags.i.i to i32* + %24 = load i32, i32* %23, align 8, !tbaa !65 + %and.i.i.i.i = and i32 %24, -261 + %or.i.i.i.i = or i32 %and.i.i.i.i, 4 + store i32 %or.i.i.i.i, i32* %23, align 4, !tbaa !60 + %conv.i = fpext float %avg_psnr to double + %call.i = call dereferenceable(272) %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"* nonnull %21, double %conv.i) #2 + %25 = bitcast %"class.std::__cxx11::basic_string"* %print_str to i8* + call void @llvm.lifetime.start(i64 32, i8* nonnull %25) #2 + call void @_ZNKSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEE3strEv(%"class.std::__cxx11::basic_string"* nonnull sret %print_str, %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i) #2 + %_M_p.i.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 0, i32 0 + %26 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !62 + %_M_string_length.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 1 + %27 = load i64, i64* %_M_string_length.i, align 8, !tbaa !59 + %call5 = call i64 @fwrite(i8* %26, i64 1, i64 %27, %struct._IO_FILE* nonnull %call) + %28 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !62 + %29 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 2 + %arraydecay.i.i.i.i = bitcast %union.anon* %29 to i8* + %cmp.i.i.i = icmp eq i8* %28, %arraydecay.i.i.i.i + br i1 %cmp.i.i.i, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit, label %if.then.i.i + +if.then.i.i: ; preds = %if.then + call void @_ZdlPv(i8* %28) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit: ; preds = %if.then, %if.then.i.i + call void @llvm.lifetime.end(i64 32, i8* nonnull %25) #2 + %30 = load i64, i64* bitcast ([4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE to i64*), align 8 + store i64 %30, i64* %6, align 16, !tbaa !46 + %31 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 3) to i64*), align 8 + %vtable.cast.i.i14 = inttoptr i64 %30 to i8* + %vbase.offset.ptr.i.i15 = getelementptr i8, i8* %vtable.cast.i.i14, i64 -24 + %32 = bitcast i8* %vbase.offset.ptr.i.i15 to i64* + %vbase.offset.i.i16 = load i64, i64* %32, align 8 + %add.ptr.i.i17 = getelementptr inbounds i8, i8* %0, i64 %vbase.offset.i.i16 + %33 = bitcast i8* %add.ptr.i.i17 to i64* + store i64 %31, i64* %33, align 8, !tbaa !46 + %34 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 0 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %34, align 8, !tbaa !46 + %_M_p.i.i.i.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 0, i32 0 + %35 = load i8*, i8** %_M_p.i.i.i.i.i.i.i, align 8, !tbaa !62 + %cmp.i.i.i.i.i.i = icmp eq i8* %35, %.cast.i.i.i + br i1 %cmp.i.i.i.i.i.i, label %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit, label %if.then.i.i.i.i.i + +if.then.i.i.i.i.i: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit + call void @_ZdlPv(i8* %35) #2 + br label %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit + +_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit, %if.then.i.i.i.i.i + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %34, align 8, !tbaa !46 + call void @_ZNSt6localeD1Ev(%"class.std::locale"* nonnull %_M_buf_locale.i.i.i) #2 + %36 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 0 + call void @_ZNSt8ios_baseD2Ev(%"class.std::ios_base"* %36) #2 + call void @llvm.lifetime.end(i64 376, i8* nonnull %0) #2 + br label %if.end + +if.end: ; preds = %entry, %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit + %call6 = call i32 @fclose(%struct._IO_FILE* %call) ret void } -declare void @__visc__hint(i32) local_unnamed_addr #3 +; Function Attrs: nounwind uwtable +define void @_Z11dumpPSNRStdf(float %psnr_std) local_unnamed_addr #3 { +entry: + %ss = alloca %"class.std::__cxx11::basic_ostringstream", align 16 + %print_str = alloca %"class.std::__cxx11::basic_string", align 8 + %call = tail call %struct._IO_FILE* @fopen(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str.28, i64 0, i64 0), i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.23, i64 0, i64 0)) + %cmp = icmp eq %struct._IO_FILE* %call, null + br i1 %cmp, label %if.end, label %if.then + +if.then: ; preds = %entry + %0 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i8* + call void @llvm.lifetime.start(i64 376, i8* nonnull %0) #2 + %1 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2 + %2 = getelementptr inbounds %"class.std::basic_ios", %"class.std::basic_ios"* %1, i64 0, i32 0 + call void @_ZNSt8ios_baseC2Ev(%"class.std::ios_base"* %2) #2 + %3 = getelementptr inbounds %"class.std::basic_ios", %"class.std::basic_ios"* %1, i64 0, i32 0, i32 0 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [4 x i8*] }, { [4 x i8*] }* @_ZTVSt9basic_iosIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %3, align 16, !tbaa !46 + %_M_tie.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 1 + store %"class.std::basic_ostream"* null, %"class.std::basic_ostream"** %_M_tie.i.i, align 8, !tbaa !48 + %_M_fill.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 2 + store i8 0, i8* %_M_fill.i.i, align 16, !tbaa !51 + %_M_fill_init.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 3 + store i8 0, i8* %_M_fill_init.i.i, align 1, !tbaa !52 + %_M_streambuf.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 4 + %4 = bitcast %"class.std::basic_streambuf"** %_M_streambuf.i.i to i8* + call void @llvm.memset.p0i8.i64(i8* %4, i8 0, i64 32, i32 8, i1 false) #2 + %5 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 1) to i64*), align 8 + %6 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i64* + store i64 %5, i64* %6, align 16, !tbaa !46 + %7 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 2) to i64*), align 8 + %8 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i8** + %vtable.cast.i.i = inttoptr i64 %5 to i8* + %vbase.offset.ptr.i.i = getelementptr i8, i8* %vtable.cast.i.i, i64 -24 + %9 = bitcast i8* %vbase.offset.ptr.i.i to i64* + %vbase.offset.i.i = load i64, i64* %9, align 8 + %add.ptr.i.i = getelementptr inbounds i8, i8* %0, i64 %vbase.offset.i.i + %10 = bitcast i8* %add.ptr.i.i to i64* + store i64 %7, i64* %10, align 8, !tbaa !46 + %vtable3.i.i = load i8*, i8** %8, align 16, !tbaa !46 + %vbase.offset.ptr4.i.i = getelementptr i8, i8* %vtable3.i.i, i64 -24 + %11 = bitcast i8* %vbase.offset.ptr4.i.i to i64* + %vbase.offset5.i.i = load i64, i64* %11, align 8 + %add.ptr6.i.i = getelementptr inbounds i8, i8* %0, i64 %vbase.offset5.i.i + %12 = bitcast i8* %add.ptr6.i.i to %"class.std::basic_ios"* + call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %12, %"class.std::basic_streambuf"* null) #2 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 1, i64 3) to i32 (...)**), i32 (...)*** %3, align 16, !tbaa !46 + %_M_stringbuf.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1 + %13 = getelementptr inbounds %"class.std::__cxx11::basic_stringbuf", %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i, i64 0, i32 0, i32 0 + %14 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to <2 x i32 (...)**>* + store <2 x i32 (...)**> <i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 3) to i32 (...)**), i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**)>, <2 x i32 (...)**>* %14, align 16, !tbaa !46 + %_M_in_beg.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 1 + %_M_buf_locale.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 7 + %15 = bitcast i8** %_M_in_beg.i.i.i to i8* + call void @llvm.memset.p0i8.i64(i8* %15, i8 0, i64 48, i32 8, i1 false) #2 + call void @_ZNSt6localeC1Ev(%"class.std::locale"* %_M_buf_locale.i.i.i) #2 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %13, align 8, !tbaa !46 + %_M_mode.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 1 + store i32 16, i32* %_M_mode.i.i, align 8, !tbaa !53 + %_M_string.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2 + %16 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 2 + %17 = bitcast %"class.std::__cxx11::basic_string"* %_M_string.i.i to %union.anon** + store %union.anon* %16, %union.anon** %17, align 8, !tbaa !58 + %_M_string_length.i.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 1 + store i64 0, i64* %_M_string_length.i.i.i.i.i, align 8, !tbaa !59 + %.cast.i.i.i = bitcast %union.anon* %16 to i8* + store i8 0, i8* %.cast.i.i.i, align 8, !tbaa !42 + %vtable.i = load i8*, i8** %8, align 16, !tbaa !46 + %vbase.offset.ptr.i = getelementptr i8, i8* %vtable.i, i64 -24 + %18 = bitcast i8* %vbase.offset.ptr.i to i64* + %vbase.offset.i = load i64, i64* %18, align 8 + %add.ptr2.i = getelementptr inbounds i8, i8* %0, i64 %vbase.offset.i + %19 = bitcast i8* %add.ptr2.i to %"class.std::basic_ios"* + %20 = getelementptr inbounds %"class.std::__cxx11::basic_stringbuf", %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i, i64 0, i32 0 + call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %19, %"class.std::basic_streambuf"* %20) #2 + %21 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to %"class.std::basic_ostream"* + %vtable.i10 = load i8*, i8** %8, align 16, !tbaa !46 + %vbase.offset.ptr.i11 = getelementptr i8, i8* %vtable.i10, i64 -24 + %22 = bitcast i8* %vbase.offset.ptr.i11 to i64* + %vbase.offset.i12 = load i64, i64* %22, align 8 + %add.ptr.i = getelementptr inbounds i8, i8* %0, i64 %vbase.offset.i12 + %_M_flags.i.i = getelementptr inbounds i8, i8* %add.ptr.i, i64 24 + %23 = bitcast i8* %_M_flags.i.i to i32* + %24 = load i32, i32* %23, align 8, !tbaa !65 + %and.i.i.i.i = and i32 %24, -261 + %or.i.i.i.i = or i32 %and.i.i.i.i, 4 + store i32 %or.i.i.i.i, i32* %23, align 4, !tbaa !60 + %conv.i = fpext float %psnr_std to double + %call.i = call dereferenceable(272) %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"* nonnull %21, double %conv.i) #2 + %25 = bitcast %"class.std::__cxx11::basic_string"* %print_str to i8* + call void @llvm.lifetime.start(i64 32, i8* nonnull %25) #2 + call void @_ZNKSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEE3strEv(%"class.std::__cxx11::basic_string"* nonnull sret %print_str, %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i) #2 + %_M_p.i.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 0, i32 0 + %26 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !62 + %_M_string_length.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 1 + %27 = load i64, i64* %_M_string_length.i, align 8, !tbaa !59 + %call5 = call i64 @fwrite(i8* %26, i64 1, i64 %27, %struct._IO_FILE* nonnull %call) + %28 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !62 + %29 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 2 + %arraydecay.i.i.i.i = bitcast %union.anon* %29 to i8* + %cmp.i.i.i = icmp eq i8* %28, %arraydecay.i.i.i.i + br i1 %cmp.i.i.i, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit, label %if.then.i.i + +if.then.i.i: ; preds = %if.then + call void @_ZdlPv(i8* %28) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit: ; preds = %if.then, %if.then.i.i + call void @llvm.lifetime.end(i64 32, i8* nonnull %25) #2 + %30 = load i64, i64* bitcast ([4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE to i64*), align 8 + store i64 %30, i64* %6, align 16, !tbaa !46 + %31 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 3) to i64*), align 8 + %vtable.cast.i.i14 = inttoptr i64 %30 to i8* + %vbase.offset.ptr.i.i15 = getelementptr i8, i8* %vtable.cast.i.i14, i64 -24 + %32 = bitcast i8* %vbase.offset.ptr.i.i15 to i64* + %vbase.offset.i.i16 = load i64, i64* %32, align 8 + %add.ptr.i.i17 = getelementptr inbounds i8, i8* %0, i64 %vbase.offset.i.i16 + %33 = bitcast i8* %add.ptr.i.i17 to i64* + store i64 %31, i64* %33, align 8, !tbaa !46 + %34 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 0 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %34, align 8, !tbaa !46 + %_M_p.i.i.i.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 0, i32 0 + %35 = load i8*, i8** %_M_p.i.i.i.i.i.i.i, align 8, !tbaa !62 + %cmp.i.i.i.i.i.i = icmp eq i8* %35, %.cast.i.i.i + br i1 %cmp.i.i.i.i.i.i, label %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit, label %if.then.i.i.i.i.i -declare void @__visc__attributes(i32, ...) local_unnamed_addr #3 +if.then.i.i.i.i.i: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit + call void @_ZdlPv(i8* %35) #2 + br label %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit -declare i8* @__visc__tensor_convolution(i8*, i8*, i32, i32, i32, i32) local_unnamed_addr #3 +_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit, %if.then.i.i.i.i.i + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %34, align 8, !tbaa !46 + call void @_ZNSt6localeD1Ev(%"class.std::locale"* nonnull %_M_buf_locale.i.i.i) #2 + %36 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 0 + call void @_ZNSt8ios_baseD2Ev(%"class.std::ios_base"* %36) #2 + call void @llvm.lifetime.end(i64 376, i8* nonnull %0) #2 + br label %if.end -declare void @__visc__return(i32, ...) local_unnamed_addr #3 +if.end: ; preds = %entry, %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit + %call6 = call i32 @fclose(%struct._IO_FILE* %call) + ret void +} ; Function Attrs: nounwind uwtable -define void @_Z10var_1_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #0 { +define void @_Z23dumpExecutionAccuraciesv() local_unnamed_addr #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #7 - %call = tail call i8* @__visc__tensor_add(i8* %t1, i8* %t2) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + %ss = alloca %"class.std::__cxx11::basic_ostringstream", align 16 + %print_str = alloca %"class.std::__cxx11::basic_string", align 8 + %call = tail call %struct._IO_FILE* @fopen(i8* getelementptr inbounds ([19 x i8], [19 x i8]* @.str.29, i64 0, i64 0), i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.23, i64 0, i64 0)) + %cmp = icmp eq %struct._IO_FILE* %call, null + br i1 %cmp, label %if.end, label %for.cond.preheader + +for.cond.preheader: ; preds = %entry + %0 = load i64, i64* bitcast (float** getelementptr inbounds (%"class.std::vector", %"class.std::vector"* @run_accuracies, i64 0, i32 0, i32 0, i32 1) to i64*), align 8, !tbaa !74 + %1 = load i64, i64* bitcast (%"class.std::vector"* @run_accuracies to i64*), align 8, !tbaa !1 + %cmp231 = icmp eq i64 %0, %1 + br i1 %cmp231, label %if.end, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %for.cond.preheader + %2 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i8* + %3 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2 + %4 = getelementptr inbounds %"class.std::basic_ios", %"class.std::basic_ios"* %3, i64 0, i32 0 + %5 = getelementptr inbounds %"class.std::basic_ios", %"class.std::basic_ios"* %3, i64 0, i32 0, i32 0 + %_M_tie.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 1 + %_M_fill.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 2 + %_M_fill_init.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 3 + %_M_streambuf.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 4 + %6 = bitcast %"class.std::basic_streambuf"** %_M_streambuf.i.i to i8* + %7 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 1) to i64*), align 8 + %8 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i64* + %9 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 2) to i64*), align 8 + %10 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i8** + %vtable.cast.i.i20 = inttoptr i64 %7 to i8* + %vbase.offset.ptr.i.i21 = getelementptr i8, i8* %vtable.cast.i.i20, i64 -24 + %11 = bitcast i8* %vbase.offset.ptr.i.i21 to i64* + %_M_stringbuf.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1 + %12 = getelementptr inbounds %"class.std::__cxx11::basic_stringbuf", %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i, i64 0, i32 0, i32 0 + %_M_in_beg.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 1 + %_M_buf_locale.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 7 + %13 = bitcast i8** %_M_in_beg.i.i.i to i8* + %_M_mode.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 1 + %_M_string.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2 + %14 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 2 + %15 = bitcast %"class.std::__cxx11::basic_string"* %_M_string.i.i to %union.anon** + %_M_string_length.i.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 1 + %.cast.i.i.i = bitcast %union.anon* %14 to i8* + %16 = getelementptr inbounds %"class.std::__cxx11::basic_stringbuf", %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i, i64 0, i32 0 + %17 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to %"class.std::basic_ostream"* + %18 = bitcast %"class.std::__cxx11::basic_string"* %print_str to i8* + %_M_p.i.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 0, i32 0 + %_M_string_length.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 1 + %19 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 2 + %arraydecay.i.i.i.i = bitcast %union.anon* %19 to i8* + %20 = load i64, i64* bitcast ([4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE to i64*), align 8 + %21 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 3) to i64*), align 8 + %vtable.cast.i.i = inttoptr i64 %20 to i8* + %vbase.offset.ptr.i.i = getelementptr i8, i8* %vtable.cast.i.i, i64 -24 + %22 = bitcast i8* %vbase.offset.ptr.i.i to i64* + %23 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 0 + %_M_p.i.i.i.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 0, i32 0 + %24 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 0 + %25 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to <2 x i32 (...)**>* + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit + %.in = phi i64 [ %1, %for.body.lr.ph ], [ %42, %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit ] + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit ] + %26 = inttoptr i64 %.in to float* + %add.ptr.i = getelementptr inbounds float, float* %26, i64 %indvars.iv + %27 = load float, float* %add.ptr.i, align 4, !tbaa !20 + call void @llvm.lifetime.start(i64 376, i8* nonnull %2) #2 + call void @_ZNSt8ios_baseC2Ev(%"class.std::ios_base"* %4) #2 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [4 x i8*] }, { [4 x i8*] }* @_ZTVSt9basic_iosIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %5, align 16, !tbaa !46 + store %"class.std::basic_ostream"* null, %"class.std::basic_ostream"** %_M_tie.i.i, align 8, !tbaa !48 + store i8 0, i8* %_M_fill.i.i, align 16, !tbaa !51 + store i8 0, i8* %_M_fill_init.i.i, align 1, !tbaa !52 + call void @llvm.memset.p0i8.i64(i8* %6, i8 0, i64 32, i32 8, i1 false) #2 + store i64 %7, i64* %8, align 16, !tbaa !46 + %vbase.offset.i.i22 = load i64, i64* %11, align 8 + %add.ptr.i.i23 = getelementptr inbounds i8, i8* %2, i64 %vbase.offset.i.i22 + %28 = bitcast i8* %add.ptr.i.i23 to i64* + store i64 %9, i64* %28, align 8, !tbaa !46 + %vtable3.i.i = load i8*, i8** %10, align 16, !tbaa !46 + %vbase.offset.ptr4.i.i = getelementptr i8, i8* %vtable3.i.i, i64 -24 + %29 = bitcast i8* %vbase.offset.ptr4.i.i to i64* + %vbase.offset5.i.i = load i64, i64* %29, align 8 + %add.ptr6.i.i = getelementptr inbounds i8, i8* %2, i64 %vbase.offset5.i.i + %30 = bitcast i8* %add.ptr6.i.i to %"class.std::basic_ios"* + call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %30, %"class.std::basic_streambuf"* null) #2 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 1, i64 3) to i32 (...)**), i32 (...)*** %5, align 16, !tbaa !46 + store <2 x i32 (...)**> <i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 3) to i32 (...)**), i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**)>, <2 x i32 (...)**>* %25, align 16, !tbaa !46 + call void @llvm.memset.p0i8.i64(i8* %13, i8 0, i64 48, i32 8, i1 false) #2 + call void @_ZNSt6localeC1Ev(%"class.std::locale"* %_M_buf_locale.i.i.i) #2 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %12, align 8, !tbaa !46 + store i32 16, i32* %_M_mode.i.i, align 8, !tbaa !53 + store %union.anon* %14, %union.anon** %15, align 8, !tbaa !58 + store i64 0, i64* %_M_string_length.i.i.i.i.i, align 8, !tbaa !59 + store i8 0, i8* %.cast.i.i.i, align 8, !tbaa !42 + %vtable.i = load i8*, i8** %10, align 16, !tbaa !46 + %vbase.offset.ptr.i = getelementptr i8, i8* %vtable.i, i64 -24 + %31 = bitcast i8* %vbase.offset.ptr.i to i64* + %vbase.offset.i = load i64, i64* %31, align 8 + %add.ptr2.i = getelementptr inbounds i8, i8* %2, i64 %vbase.offset.i + %32 = bitcast i8* %add.ptr2.i to %"class.std::basic_ios"* + call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %32, %"class.std::basic_streambuf"* %16) #2 + %vtable.i24 = load i8*, i8** %10, align 16, !tbaa !46 + %vbase.offset.ptr.i25 = getelementptr i8, i8* %vtable.i24, i64 -24 + %33 = bitcast i8* %vbase.offset.ptr.i25 to i64* + %vbase.offset.i26 = load i64, i64* %33, align 8 + %add.ptr.i27 = getelementptr inbounds i8, i8* %2, i64 %vbase.offset.i26 + %_M_flags.i.i = getelementptr inbounds i8, i8* %add.ptr.i27, i64 24 + %34 = bitcast i8* %_M_flags.i.i to i32* + %35 = load i32, i32* %34, align 8, !tbaa !65 + %and.i.i.i.i = and i32 %35, -261 + %or.i.i.i.i = or i32 %and.i.i.i.i, 4 + store i32 %or.i.i.i.i, i32* %34, align 4, !tbaa !60 + %conv.i = fpext float %27 to double + %call.i = call dereferenceable(272) %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"* nonnull %17, double %conv.i) #2 + call void @llvm.lifetime.start(i64 32, i8* nonnull %18) #2 + call void @_ZNKSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEE3strEv(%"class.std::__cxx11::basic_string"* nonnull sret %print_str, %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i) #2 + %36 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !62 + %37 = load i64, i64* %_M_string_length.i, align 8, !tbaa !59 + %call9 = call i64 @fwrite(i8* %36, i64 1, i64 %37, %struct._IO_FILE* nonnull %call) + %fputc = call i32 @fputc(i32 10, %struct._IO_FILE* nonnull %call) + %38 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !62 + %cmp.i.i.i = icmp eq i8* %38, %arraydecay.i.i.i.i + br i1 %cmp.i.i.i, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit, label %if.then.i.i + +if.then.i.i: ; preds = %for.body + call void @_ZdlPv(i8* %38) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit: ; preds = %for.body, %if.then.i.i + call void @llvm.lifetime.end(i64 32, i8* nonnull %18) #2 + store i64 %20, i64* %8, align 16, !tbaa !46 + %vbase.offset.i.i = load i64, i64* %22, align 8 + %add.ptr.i.i = getelementptr inbounds i8, i8* %2, i64 %vbase.offset.i.i + %39 = bitcast i8* %add.ptr.i.i to i64* + store i64 %21, i64* %39, align 8, !tbaa !46 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %23, align 8, !tbaa !46 + %40 = load i8*, i8** %_M_p.i.i.i.i.i.i.i, align 8, !tbaa !62 + %cmp.i.i.i.i.i.i = icmp eq i8* %40, %.cast.i.i.i + br i1 %cmp.i.i.i.i.i.i, label %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit, label %if.then.i.i.i.i.i + +if.then.i.i.i.i.i: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit + call void @_ZdlPv(i8* %40) #2 + br label %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit + +_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit, %if.then.i.i.i.i.i + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %23, align 8, !tbaa !46 + call void @_ZNSt6localeD1Ev(%"class.std::locale"* nonnull %_M_buf_locale.i.i.i) #2 + call void @_ZNSt8ios_baseD2Ev(%"class.std::ios_base"* nonnull %24) #2 + call void @llvm.lifetime.end(i64 376, i8* nonnull %2) #2 + %indvars.iv.next = add nuw i64 %indvars.iv, 1 + %41 = load i64, i64* bitcast (float** getelementptr inbounds (%"class.std::vector", %"class.std::vector"* @run_accuracies, i64 0, i32 0, i32 0, i32 1) to i64*), align 8, !tbaa !74 + %42 = load i64, i64* bitcast (%"class.std::vector"* @run_accuracies to i64*), align 8, !tbaa !1 + %sub.ptr.sub.i = sub i64 %41, %42 + %sub.ptr.div.i = ashr exact i64 %sub.ptr.sub.i, 2 + %cmp2 = icmp ult i64 %indvars.iv.next, %sub.ptr.div.i + br i1 %cmp2, label %for.body, label %if.end.loopexit + +if.end.loopexit: ; preds = %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit + br label %if.end + +if.end: ; preds = %if.end.loopexit, %for.cond.preheader, %entry + %call11 = call i32 @fclose(%struct._IO_FILE* %call) ret void } -declare i8* @__visc__tensor_add(i8*, i8*) local_unnamed_addr #3 +; Function Attrs: nounwind uwtable +define float @_Z16readPSNRFromFilePKc(i8* nocapture readonly %file_name) local_unnamed_addr #3 { +entry: + %psnr = alloca float, align 4 + %0 = bitcast float* %psnr to i8* + call void @llvm.lifetime.start(i64 4, i8* nonnull %0) #2 + %call = tail call %struct._IO_FILE* @fopen(i8* %file_name, i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.30, i64 0, i64 0)) + %cmp = icmp eq %struct._IO_FILE* %call, null + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %puts = tail call i32 @puts(i8* getelementptr inbounds ([28 x i8], [28 x i8]* @str.79, i64 0, i64 0)) + tail call void @abort() #13 + unreachable + +if.end: ; preds = %entry + %call2 = call i32 (%struct._IO_FILE*, i8*, ...) @fscanf(%struct._IO_FILE* nonnull %call, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.32, i64 0, i64 0), float* nonnull %psnr) + %1 = load float, float* %psnr, align 4, !tbaa !20 + %conv = fpext float %1 to double + %call3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str.33, i64 0, i64 0), double %conv) + %2 = load float, float* %psnr, align 4, !tbaa !20 + call void @llvm.lifetime.end(i64 4, i8* nonnull %0) #2 + ret float %2 +} + +; Function Attrs: nounwind +declare i32 @fscanf(%struct._IO_FILE* nocapture, i8* nocapture readonly, ...) local_unnamed_addr #1 ; Function Attrs: nounwind uwtable -define void @_Z10var_2_nodePvm(i8* %t1, i64 %bytes_t1) #0 { +define float @_Z20computePSNRViolationPvS_f(i8* nocapture readonly %gold_ptr, i8* nocapture readonly %approx_ptr, float %PSNR_threshold) local_unnamed_addr #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 1, i8* %t1, i32 0) #7 - %call = tail call i8* @__visc__tensor_relu(i8* %t1) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + %psnr.i = alloca float, align 4 + %psnr_list.sroa.9 = alloca i64, align 8 + %psnr_list.sroa.13 = alloca float*, align 8 + %ss = alloca %"class.std::__cxx11::basic_ostringstream", align 16 + %print_str = alloca %"class.std::__cxx11::basic_string", align 8 + %0 = bitcast float* %psnr.i to i8* + call void @llvm.lifetime.start(i64 4, i8* nonnull %0) #2 + %call.i = tail call %struct._IO_FILE* @fopen(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str.34, i64 0, i64 0), i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.30, i64 0, i64 0)) #2 + %cmp.i = icmp eq %struct._IO_FILE* %call.i, null + br i1 %cmp.i, label %if.then.i, label %_Z16readPSNRFromFilePKc.exit + +if.then.i: ; preds = %entry + %puts.i = tail call i32 @puts(i8* getelementptr inbounds ([28 x i8], [28 x i8]* @str.79, i64 0, i64 0)) #2 + tail call void @abort() #13 + unreachable + +_Z16readPSNRFromFilePKc.exit: ; preds = %entry + %call2.i = call i32 (%struct._IO_FILE*, i8*, ...) @fscanf(%struct._IO_FILE* nonnull %call.i, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.32, i64 0, i64 0), float* nonnull %psnr.i) #2 + %1 = load float, float* %psnr.i, align 4, !tbaa !20 + %conv.i = fpext float %1 to double + %call3.i = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str.33, i64 0, i64 0), double %conv.i) #2 + %2 = load float, float* %psnr.i, align 4, !tbaa !20 + call void @llvm.lifetime.end(i64 4, i8* nonnull %0) #2 + %psnr_list.sroa.9.0..sroa_cast174 = bitcast i64* %psnr_list.sroa.9 to i8* + call void @llvm.lifetime.start(i64 8, i8* nonnull %psnr_list.sroa.9.0..sroa_cast174) + %psnr_list.sroa.13.0..sroa_cast172 = bitcast float** %psnr_list.sroa.13 to i8* + call void @llvm.lifetime.start(i64 8, i8* nonnull %psnr_list.sroa.13.0..sroa_cast172) + store i64 0, i64* %psnr_list.sroa.9, align 8 + store float* null, float** %psnr_list.sroa.13, align 8 + %dim_sizes1 = getelementptr inbounds i8, i8* %gold_ptr, i64 96 + %3 = bitcast i8* %dim_sizes1 to i64** + %4 = load i64*, i64** %3, align 8, !tbaa !14 + %5 = load i64, i64* %4, align 8, !tbaa !15 + %arrayidx2 = getelementptr inbounds i64, i64* %4, i64 1 + %6 = load i64, i64* %arrayidx2, align 8, !tbaa !15 + %arrayidx3 = getelementptr inbounds i64, i64* %4, i64 2 + %7 = load i64, i64* %arrayidx3, align 8, !tbaa !15 + %mul = mul i64 %7, %6 + %arrayidx4 = getelementptr inbounds i64, i64* %4, i64 3 + %8 = load i64, i64* %arrayidx4, align 8, !tbaa !15 + %mul5 = mul i64 %mul, %8 + %call6 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.str.35, i64 0, i64 0), i64 %5, i64 %mul5) + %host_data = getelementptr inbounds i8, i8* %gold_ptr, i64 48 + %9 = bitcast i8* %host_data to float** + %10 = load float*, float** %9, align 8, !tbaa !17 + %host_data7 = getelementptr inbounds i8, i8* %approx_ptr, i64 48 + %11 = bitcast i8* %host_data7 to float** + %12 = load float*, float** %11, align 8, !tbaa !17 + %call8 = call %struct._IO_FILE* @fopen(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str.36, i64 0, i64 0), i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.23, i64 0, i64 0)) + %cmp187 = icmp eq i64 %5, 0 + br i1 %cmp187, label %for.cond.cleanup, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %_Z16readPSNRFromFilePKc.exit + %cmp11182 = icmp eq i64 %mul5, 0 + %conv = uitofp i64 %mul5 to float + %13 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i8* + %14 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2 + %15 = getelementptr inbounds %"class.std::basic_ios", %"class.std::basic_ios"* %14, i64 0, i32 0 + %16 = getelementptr inbounds %"class.std::basic_ios", %"class.std::basic_ios"* %14, i64 0, i32 0, i32 0 + %_M_tie.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 1 + %_M_fill.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 2 + %_M_fill_init.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 3 + %_M_streambuf.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 4 + %17 = bitcast %"class.std::basic_streambuf"** %_M_streambuf.i.i to i8* + %18 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 1) to i64*), align 8 + %19 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i64* + %20 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 2) to i64*), align 8 + %21 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i8** + %vtable.cast.i.i = inttoptr i64 %18 to i8* + %vbase.offset.ptr.i.i = getelementptr i8, i8* %vtable.cast.i.i, i64 -24 + %22 = bitcast i8* %vbase.offset.ptr.i.i to i64* + %_M_stringbuf.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1 + %23 = getelementptr inbounds %"class.std::__cxx11::basic_stringbuf", %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i, i64 0, i32 0, i32 0 + %_M_in_beg.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 1 + %_M_buf_locale.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 7 + %24 = bitcast i8** %_M_in_beg.i.i.i to i8* + %_M_mode.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 1 + %_M_string.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2 + %25 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 2 + %26 = bitcast %"class.std::__cxx11::basic_string"* %_M_string.i.i to %union.anon** + %_M_string_length.i.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 1 + %.cast.i.i.i = bitcast %union.anon* %25 to i8* + %27 = getelementptr inbounds %"class.std::__cxx11::basic_stringbuf", %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i, i64 0, i32 0 + %28 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to %"class.std::basic_ostream"* + %29 = bitcast %"class.std::__cxx11::basic_string"* %print_str to i8* + %_M_p.i.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 0, i32 0 + %_M_string_length.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 1 + %30 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 2 + %arraydecay.i.i.i.i = bitcast %union.anon* %30 to i8* + %31 = load i64, i64* bitcast ([4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE to i64*), align 8 + %32 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 3) to i64*), align 8 + %vtable.cast.i.i153 = inttoptr i64 %31 to i8* + %vbase.offset.ptr.i.i154 = getelementptr i8, i8* %vtable.cast.i.i153, i64 -24 + %33 = bitcast i8* %vbase.offset.ptr.i.i154 to i64* + %34 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 0 + %_M_p.i.i.i.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 0, i32 0 + %35 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 0 + %psnr_list.sroa.9.0._M_finish.i.sroa_cast = bitcast i64* %psnr_list.sroa.9 to float** + %36 = add i64 %mul5, -8 + %37 = lshr i64 %36, 3 + %38 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to <2 x i32 (...)**>* + %min.iters.check = icmp ult i64 %mul5, 8 + %n.vec = and i64 %mul5, -8 + %cmp.zero = icmp eq i64 %n.vec, 0 + %39 = and i64 %37, 1 + %lcmp.mod246 = icmp eq i64 %39, 0 + %40 = icmp eq i64 %37, 0 + %cmp.n = icmp eq i64 %mul5, %n.vec + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit + %phitmp = sitofp i32 %num_errors.1 to double + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %_Z16readPSNRFromFilePKc.exit + %psnr_list.sroa.0.0.lcssa = phi i64 [ 0, %_Z16readPSNRFromFilePKc.exit ], [ %psnr_list.sroa.0.1, %for.cond.cleanup.loopexit ] + %num_errors.0.lcssa = phi double [ 0.000000e+00, %_Z16readPSNRFromFilePKc.exit ], [ %phitmp, %for.cond.cleanup.loopexit ] + %sum_psnr.0.lcssa = phi float [ 0.000000e+00, %_Z16readPSNRFromFilePKc.exit ], [ %add28, %for.cond.cleanup.loopexit ] + %conv46 = uitofp i64 %5 to double + %div47 = fdiv fast double %num_errors.0.lcssa, %conv46 + %mul48 = fmul fast double %div47, 1.000000e+02 + %conv49 = fptrunc double %mul48 to float + %conv50 = fpext float %conv49 to double + %call51 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([26 x i8], [26 x i8]* @.str.38, i64 0, i64 0), double %conv50) + %conv52 = uitofp i64 %5 to float + %div53 = fdiv fast float %sum_psnr.0.lcssa, %conv52 + %conv54 = fpext float %div53 to double + %call55 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.39, i64 0, i64 0), double %conv54) + call void @_Z11dumpAvgPSNRf(float %div53) + %conv58 = fsub fast float 1.000000e+02, %conv49 + call void @_Z17dumpFinalAccuracyf(float %conv58) + %call59 = call i32 @fclose(%struct._IO_FILE* %call8) + br i1 %cmp187, label %for.cond.cleanup63, label %for.body64.lr.ph + +for.body64.lr.ph: ; preds = %for.cond.cleanup + %41 = inttoptr i64 %psnr_list.sroa.0.0.lcssa to float* + %min.iters.check213 = icmp ult i64 %5, 8 + br i1 %min.iters.check213, label %for.body64.preheader, label %min.iters.checked214 + +for.body64.preheader: ; preds = %middle.block211, %min.iters.checked214, %for.body64.lr.ph + %i60.0181.ph = phi i64 [ 0, %min.iters.checked214 ], [ 0, %for.body64.lr.ph ], [ %n.vec216, %middle.block211 ] + %var.0180.ph = phi float [ 0.000000e+00, %min.iters.checked214 ], [ 0.000000e+00, %for.body64.lr.ph ], [ %74, %middle.block211 ] + br label %for.body64 + +min.iters.checked214: ; preds = %for.body64.lr.ph + %n.vec216 = and i64 %5, -8 + %cmp.zero217 = icmp eq i64 %n.vec216, 0 + br i1 %cmp.zero217, label %for.body64.preheader, label %vector.ph218 + +vector.ph218: ; preds = %min.iters.checked214 + %broadcast.splatinsert231 = insertelement <4 x float> undef, float %div53, i32 0 + %broadcast.splat232 = shufflevector <4 x float> %broadcast.splatinsert231, <4 x float> undef, <4 x i32> zeroinitializer + %42 = add i64 %n.vec216, -8 + %43 = lshr exact i64 %42, 3 + %44 = and i64 %43, 1 + %lcmp.mod = icmp eq i64 %44, 0 + br i1 %lcmp.mod, label %vector.body210.prol.preheader, label %vector.body210.prol.loopexit + +vector.body210.prol.preheader: ; preds = %vector.ph218 + br label %vector.body210.prol + +vector.body210.prol: ; preds = %vector.body210.prol.preheader + %45 = inttoptr i64 %psnr_list.sroa.0.0.lcssa to <4 x float>* + %wide.load229.prol = load <4 x float>, <4 x float>* %45, align 4, !tbaa !20 + %46 = getelementptr float, float* %41, i64 4 + %47 = bitcast float* %46 to <4 x float>* + %wide.load230.prol = load <4 x float>, <4 x float>* %47, align 4, !tbaa !20 + %48 = fsub fast <4 x float> %wide.load229.prol, %broadcast.splat232 + %49 = fsub fast <4 x float> %wide.load230.prol, %broadcast.splat232 + %50 = fmul fast <4 x float> %48, %48 + %51 = fmul fast <4 x float> %49, %49 + br label %vector.body210.prol.loopexit + +vector.body210.prol.loopexit: ; preds = %vector.body210.prol, %vector.ph218 + %.lcssa240.unr = phi <4 x float> [ undef, %vector.ph218 ], [ %50, %vector.body210.prol ] + %.lcssa.unr = phi <4 x float> [ undef, %vector.ph218 ], [ %51, %vector.body210.prol ] + %index219.unr = phi i64 [ 0, %vector.ph218 ], [ 8, %vector.body210.prol ] + %vec.phi227.unr = phi <4 x float> [ zeroinitializer, %vector.ph218 ], [ %50, %vector.body210.prol ] + %vec.phi228.unr = phi <4 x float> [ zeroinitializer, %vector.ph218 ], [ %51, %vector.body210.prol ] + %52 = icmp eq i64 %43, 0 + br i1 %52, label %middle.block211, label %vector.ph218.new + +vector.ph218.new: ; preds = %vector.body210.prol.loopexit + br label %vector.body210 + +vector.body210: ; preds = %vector.body210, %vector.ph218.new + %index219 = phi i64 [ %index219.unr, %vector.ph218.new ], [ %index.next220.1, %vector.body210 ] + %vec.phi227 = phi <4 x float> [ %vec.phi227.unr, %vector.ph218.new ], [ %71, %vector.body210 ] + %vec.phi228 = phi <4 x float> [ %vec.phi228.unr, %vector.ph218.new ], [ %72, %vector.body210 ] + %53 = getelementptr inbounds float, float* %41, i64 %index219 + %54 = bitcast float* %53 to <4 x float>* + %wide.load229 = load <4 x float>, <4 x float>* %54, align 4, !tbaa !20 + %55 = getelementptr float, float* %53, i64 4 + %56 = bitcast float* %55 to <4 x float>* + %wide.load230 = load <4 x float>, <4 x float>* %56, align 4, !tbaa !20 + %57 = fsub fast <4 x float> %wide.load229, %broadcast.splat232 + %58 = fsub fast <4 x float> %wide.load230, %broadcast.splat232 + %59 = fmul fast <4 x float> %57, %57 + %60 = fmul fast <4 x float> %58, %58 + %61 = fadd fast <4 x float> %59, %vec.phi227 + %62 = fadd fast <4 x float> %60, %vec.phi228 + %index.next220 = add i64 %index219, 8 + %63 = getelementptr inbounds float, float* %41, i64 %index.next220 + %64 = bitcast float* %63 to <4 x float>* + %wide.load229.1 = load <4 x float>, <4 x float>* %64, align 4, !tbaa !20 + %65 = getelementptr float, float* %63, i64 4 + %66 = bitcast float* %65 to <4 x float>* + %wide.load230.1 = load <4 x float>, <4 x float>* %66, align 4, !tbaa !20 + %67 = fsub fast <4 x float> %wide.load229.1, %broadcast.splat232 + %68 = fsub fast <4 x float> %wide.load230.1, %broadcast.splat232 + %69 = fmul fast <4 x float> %67, %67 + %70 = fmul fast <4 x float> %68, %68 + %71 = fadd fast <4 x float> %69, %61 + %72 = fadd fast <4 x float> %70, %62 + %index.next220.1 = add i64 %index219, 16 + %73 = icmp eq i64 %index.next220.1, %n.vec216 + br i1 %73, label %middle.block211.unr-lcssa, label %vector.body210, !llvm.loop !76 + +middle.block211.unr-lcssa: ; preds = %vector.body210 + br label %middle.block211 + +middle.block211: ; preds = %vector.body210.prol.loopexit, %middle.block211.unr-lcssa + %.lcssa240 = phi <4 x float> [ %.lcssa240.unr, %vector.body210.prol.loopexit ], [ %71, %middle.block211.unr-lcssa ] + %.lcssa = phi <4 x float> [ %.lcssa.unr, %vector.body210.prol.loopexit ], [ %72, %middle.block211.unr-lcssa ] + %bin.rdx233 = fadd fast <4 x float> %.lcssa, %.lcssa240 + %rdx.shuf234 = shufflevector <4 x float> %bin.rdx233, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> + %bin.rdx235 = fadd fast <4 x float> %bin.rdx233, %rdx.shuf234 + %rdx.shuf236 = shufflevector <4 x float> %bin.rdx235, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> + %bin.rdx237 = fadd fast <4 x float> %bin.rdx235, %rdx.shuf236 + %74 = extractelement <4 x float> %bin.rdx237, i32 0 + %cmp.n222 = icmp eq i64 %5, %n.vec216 + br i1 %cmp.n222, label %for.cond.cleanup63, label %for.body64.preheader + +for.body: ; preds = %for.body.lr.ph, %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit + %sum_psnr.0191 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add28, %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit ] + %num_errors.0190 = phi i32 [ 0, %for.body.lr.ph ], [ %num_errors.1, %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit ] + %i.0189 = phi i64 [ 0, %for.body.lr.ph ], [ %inc42, %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit ] + %psnr_list.sroa.0.0188 = phi i64 [ 0, %for.body.lr.ph ], [ %psnr_list.sroa.0.1, %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit ] + %mul9 = mul i64 %i.0189, %mul5 + br i1 %cmp11182, label %for.cond.cleanup12, label %for.body13.preheader + +for.body13.preheader: ; preds = %for.body + br i1 %min.iters.check, label %for.body13.preheader239, label %min.iters.checked + +for.body13.preheader239: ; preds = %middle.block, %min.iters.checked, %for.body13.preheader + %mse_sum.0185.ph = phi float [ 0.000000e+00, %min.iters.checked ], [ 0.000000e+00, %for.body13.preheader ], [ %118, %middle.block ] + %j.0184.ph = phi i64 [ 0, %min.iters.checked ], [ 0, %for.body13.preheader ], [ %n.vec, %middle.block ] + br label %for.body13 + +min.iters.checked: ; preds = %for.body13.preheader + br i1 %cmp.zero, label %for.body13.preheader239, label %vector.body.preheader + +vector.body.preheader: ; preds = %min.iters.checked + br i1 %lcmp.mod246, label %vector.body.prol.preheader, label %vector.body.prol.loopexit.unr-lcssa + +vector.body.prol.preheader: ; preds = %vector.body.preheader + br label %vector.body.prol + +vector.body.prol: ; preds = %vector.body.prol.preheader + %75 = getelementptr inbounds float, float* %10, i64 %mul9 + %76 = bitcast float* %75 to <4 x float>* + %wide.load.prol = load <4 x float>, <4 x float>* %76, align 4, !tbaa !20 + %77 = getelementptr float, float* %75, i64 4 + %78 = bitcast float* %77 to <4 x float>* + %wide.load204.prol = load <4 x float>, <4 x float>* %78, align 4, !tbaa !20 + %79 = getelementptr inbounds float, float* %12, i64 %mul9 + %80 = bitcast float* %79 to <4 x float>* + %wide.load205.prol = load <4 x float>, <4 x float>* %80, align 4, !tbaa !20 + %81 = getelementptr float, float* %79, i64 4 + %82 = bitcast float* %81 to <4 x float>* + %wide.load206.prol = load <4 x float>, <4 x float>* %82, align 4, !tbaa !20 + %83 = fsub fast <4 x float> %wide.load.prol, %wide.load205.prol + %84 = fsub fast <4 x float> %wide.load204.prol, %wide.load206.prol + %85 = fmul fast <4 x float> %83, %83 + %86 = fmul fast <4 x float> %84, %84 + br label %vector.body.prol.loopexit.unr-lcssa + +vector.body.prol.loopexit.unr-lcssa: ; preds = %vector.body.preheader, %vector.body.prol + %.lcssa242.unr.ph = phi <4 x float> [ %85, %vector.body.prol ], [ undef, %vector.body.preheader ] + %.lcssa241.unr.ph = phi <4 x float> [ %86, %vector.body.prol ], [ undef, %vector.body.preheader ] + %index.unr.ph = phi i64 [ 8, %vector.body.prol ], [ 0, %vector.body.preheader ] + %vec.phi.unr.ph = phi <4 x float> [ %85, %vector.body.prol ], [ zeroinitializer, %vector.body.preheader ] + %vec.phi202.unr.ph = phi <4 x float> [ %86, %vector.body.prol ], [ zeroinitializer, %vector.body.preheader ] + br label %vector.body.prol.loopexit + +vector.body.prol.loopexit: ; preds = %vector.body.prol.loopexit.unr-lcssa + br i1 %40, label %middle.block, label %vector.body.preheader.new + +vector.body.preheader.new: ; preds = %vector.body.prol.loopexit + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.body.preheader.new + %index = phi i64 [ %index.unr.ph, %vector.body.preheader.new ], [ %index.next.1, %vector.body ] + %vec.phi = phi <4 x float> [ %vec.phi.unr.ph, %vector.body.preheader.new ], [ %115, %vector.body ] + %vec.phi202 = phi <4 x float> [ %vec.phi202.unr.ph, %vector.body.preheader.new ], [ %116, %vector.body ] + %87 = add i64 %index, %mul9 + %88 = getelementptr inbounds float, float* %10, i64 %87 + %89 = bitcast float* %88 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %89, align 4, !tbaa !20 + %90 = getelementptr float, float* %88, i64 4 + %91 = bitcast float* %90 to <4 x float>* + %wide.load204 = load <4 x float>, <4 x float>* %91, align 4, !tbaa !20 + %92 = getelementptr inbounds float, float* %12, i64 %87 + %93 = bitcast float* %92 to <4 x float>* + %wide.load205 = load <4 x float>, <4 x float>* %93, align 4, !tbaa !20 + %94 = getelementptr float, float* %92, i64 4 + %95 = bitcast float* %94 to <4 x float>* + %wide.load206 = load <4 x float>, <4 x float>* %95, align 4, !tbaa !20 + %96 = fsub fast <4 x float> %wide.load, %wide.load205 + %97 = fsub fast <4 x float> %wide.load204, %wide.load206 + %98 = fmul fast <4 x float> %96, %96 + %99 = fmul fast <4 x float> %97, %97 + %100 = fadd fast <4 x float> %98, %vec.phi + %101 = fadd fast <4 x float> %99, %vec.phi202 + %index.next = add i64 %index, 8 + %102 = add i64 %index.next, %mul9 + %103 = getelementptr inbounds float, float* %10, i64 %102 + %104 = bitcast float* %103 to <4 x float>* + %wide.load.1 = load <4 x float>, <4 x float>* %104, align 4, !tbaa !20 + %105 = getelementptr float, float* %103, i64 4 + %106 = bitcast float* %105 to <4 x float>* + %wide.load204.1 = load <4 x float>, <4 x float>* %106, align 4, !tbaa !20 + %107 = getelementptr inbounds float, float* %12, i64 %102 + %108 = bitcast float* %107 to <4 x float>* + %wide.load205.1 = load <4 x float>, <4 x float>* %108, align 4, !tbaa !20 + %109 = getelementptr float, float* %107, i64 4 + %110 = bitcast float* %109 to <4 x float>* + %wide.load206.1 = load <4 x float>, <4 x float>* %110, align 4, !tbaa !20 + %111 = fsub fast <4 x float> %wide.load.1, %wide.load205.1 + %112 = fsub fast <4 x float> %wide.load204.1, %wide.load206.1 + %113 = fmul fast <4 x float> %111, %111 + %114 = fmul fast <4 x float> %112, %112 + %115 = fadd fast <4 x float> %113, %100 + %116 = fadd fast <4 x float> %114, %101 + %index.next.1 = add i64 %index, 16 + %117 = icmp eq i64 %index.next.1, %n.vec + br i1 %117, label %middle.block.unr-lcssa, label %vector.body, !llvm.loop !77 + +middle.block.unr-lcssa: ; preds = %vector.body + br label %middle.block + +middle.block: ; preds = %vector.body.prol.loopexit, %middle.block.unr-lcssa + %.lcssa242 = phi <4 x float> [ %.lcssa242.unr.ph, %vector.body.prol.loopexit ], [ %115, %middle.block.unr-lcssa ] + %.lcssa241 = phi <4 x float> [ %.lcssa241.unr.ph, %vector.body.prol.loopexit ], [ %116, %middle.block.unr-lcssa ] + %bin.rdx = fadd fast <4 x float> %.lcssa241, %.lcssa242 + %rdx.shuf = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> + %bin.rdx207 = fadd fast <4 x float> %bin.rdx, %rdx.shuf + %rdx.shuf208 = shufflevector <4 x float> %bin.rdx207, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> + %bin.rdx209 = fadd fast <4 x float> %bin.rdx207, %rdx.shuf208 + %118 = extractelement <4 x float> %bin.rdx209, i32 0 + br i1 %cmp.n, label %for.cond.cleanup12, label %for.body13.preheader239 + +for.cond.cleanup12.loopexit: ; preds = %for.body13 + br label %for.cond.cleanup12 + +for.cond.cleanup12: ; preds = %for.cond.cleanup12.loopexit, %middle.block, %for.body + %mse_sum.0.lcssa = phi float [ 0.000000e+00, %for.body ], [ %118, %middle.block ], [ %add18, %for.cond.cleanup12.loopexit ] + %div = fdiv fast float %mse_sum.0.lcssa, %conv + %call.i141 = call fast float @sqrtf(float %div) #12 + %div25 = fdiv fast float 2.550000e+02, %call.i141 + %call.i142 = call fast float @log10f(float %div25) #12 + %mul27 = fmul fast float %call.i142, 2.000000e+01 + %add28 = fadd fast float %mul27, %sum_psnr.0191 + %cmp29 = fcmp fast olt float %mul27, %2 + %add31 = zext i1 %cmp29 to i32 + %num_errors.1 = add nsw i32 %add31, %num_errors.0190 + %conv33 = fpext float %mul27 to double + %call34 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.37, i64 0, i64 0), double %conv33) + %psnr_list.sroa.9.0.psnr_list.sroa.9.8. = load i64, i64* %psnr_list.sroa.9, align 8 + %119 = inttoptr i64 %psnr_list.sroa.9.0.psnr_list.sroa.9.8. to float* + %psnr_list.sroa.13.0.psnr_list.sroa.13.16. = load float*, float** %psnr_list.sroa.13, align 8 + %cmp.i143 = icmp eq float* %119, %psnr_list.sroa.13.0.psnr_list.sroa.13.16. + br i1 %cmp.i143, label %if.else.i, label %if.then.i144 + +if.then.i144: ; preds = %for.cond.cleanup12 + store float %mul27, float* %119, align 4, !tbaa !20 + %incdec.ptr.i = getelementptr inbounds float, float* %119, i64 1 + br label %_ZNSt6vectorIfSaIfEE9push_backERKf.exit + +if.else.i: ; preds = %for.cond.cleanup12 + %sub.ptr.sub.i21.i.i.i = sub i64 %psnr_list.sroa.9.0.psnr_list.sroa.9.8., %psnr_list.sroa.0.0188 + %sub.ptr.div.i22.i.i.i = ashr exact i64 %sub.ptr.sub.i21.i.i.i, 2 + %cmp.i.i.i.i = icmp eq i64 %sub.ptr.div.i22.i.i.i, 0 + %.sroa.speculated.i.i.i = select i1 %cmp.i.i.i.i, i64 1, i64 %sub.ptr.div.i22.i.i.i + %add.i.i.i = add nsw i64 %.sroa.speculated.i.i.i, %sub.ptr.div.i22.i.i.i + %cmp7.i.i.i = icmp ult i64 %add.i.i.i, %sub.ptr.div.i22.i.i.i + %cmp9.i.i.i = icmp ugt i64 %add.i.i.i, 4611686018427387903 + %or.cond.i.i.i = or i1 %cmp7.i.i.i, %cmp9.i.i.i + %cond.i.i.i = select i1 %or.cond.i.i.i, i64 4611686018427387903, i64 %add.i.i.i + %cmp.i35.i.i = icmp eq i64 %cond.i.i.i, 0 + br i1 %cmp.i35.i.i, label %_ZNSt12_Vector_baseIfSaIfEE11_M_allocateEm.exit.i.i, label %cond.true.i.i.i + +cond.true.i.i.i: ; preds = %if.else.i + %cmp.i.i.i.i.i = icmp ugt i64 %cond.i.i.i, 4611686018427387903 + br i1 %cmp.i.i.i.i.i, label %if.then.i.i.i.i.i, label %_ZNSt16allocator_traitsISaIfEE8allocateERS0_m.exit.i.i.i + +if.then.i.i.i.i.i: ; preds = %cond.true.i.i.i + call void @_ZSt17__throw_bad_allocv() #13 + unreachable + +_ZNSt16allocator_traitsISaIfEE8allocateERS0_m.exit.i.i.i: ; preds = %cond.true.i.i.i + %mul.i.i.i.i.i = shl i64 %cond.i.i.i, 2 + %call2.i.i.i.i.i = call i8* @_Znwm(i64 %mul.i.i.i.i.i) #2 + %120 = bitcast i8* %call2.i.i.i.i.i to float* + br label %_ZNSt12_Vector_baseIfSaIfEE11_M_allocateEm.exit.i.i + +_ZNSt12_Vector_baseIfSaIfEE11_M_allocateEm.exit.i.i: ; preds = %_ZNSt16allocator_traitsISaIfEE8allocateERS0_m.exit.i.i.i, %if.else.i + %121 = phi i8* [ %call2.i.i.i.i.i, %_ZNSt16allocator_traitsISaIfEE8allocateERS0_m.exit.i.i.i ], [ null, %if.else.i ] + %cond.i36.i.i = phi float* [ %120, %_ZNSt16allocator_traitsISaIfEE8allocateERS0_m.exit.i.i.i ], [ null, %if.else.i ] + %add.ptr.i.i = getelementptr inbounds float, float* %cond.i36.i.i, i64 %sub.ptr.div.i22.i.i.i + store float %mul27, float* %add.ptr.i.i, align 4, !tbaa !20 + br i1 %cmp.i.i.i.i, label %_ZSt34__uninitialized_move_if_noexcept_aIPfS0_SaIfEET0_T_S3_S2_RT1_.exit.i.i, label %if.then.i.i.i.i.i.i.i.i.i.i + +if.then.i.i.i.i.i.i.i.i.i.i: ; preds = %_ZNSt12_Vector_baseIfSaIfEE11_M_allocateEm.exit.i.i + %122 = inttoptr i64 %psnr_list.sroa.0.0188 to i8* + call void @llvm.memmove.p0i8.p0i8.i64(i8* %121, i8* %122, i64 %sub.ptr.sub.i21.i.i.i, i32 4, i1 false) #2 + br label %_ZSt34__uninitialized_move_if_noexcept_aIPfS0_SaIfEET0_T_S3_S2_RT1_.exit.i.i + +_ZSt34__uninitialized_move_if_noexcept_aIPfS0_SaIfEET0_T_S3_S2_RT1_.exit.i.i: ; preds = %if.then.i.i.i.i.i.i.i.i.i.i, %_ZNSt12_Vector_baseIfSaIfEE11_M_allocateEm.exit.i.i + %incdec.ptr.i.i = getelementptr inbounds float, float* %add.ptr.i.i, i64 1 + %tobool.i.i.i = icmp eq i64 %psnr_list.sroa.0.0188, 0 + br i1 %tobool.i.i.i, label %_ZNSt6vectorIfSaIfEE19_M_emplace_back_auxIJRKfEEEvDpOT_.exit.i, label %if.then.i37.i.i + +if.then.i37.i.i: ; preds = %_ZSt34__uninitialized_move_if_noexcept_aIPfS0_SaIfEET0_T_S3_S2_RT1_.exit.i.i + %123 = inttoptr i64 %psnr_list.sroa.0.0188 to i8* + call void @_ZdlPv(i8* %123) #2 + br label %_ZNSt6vectorIfSaIfEE19_M_emplace_back_auxIJRKfEEEvDpOT_.exit.i + +_ZNSt6vectorIfSaIfEE19_M_emplace_back_auxIJRKfEEEvDpOT_.exit.i: ; preds = %if.then.i37.i.i, %_ZSt34__uninitialized_move_if_noexcept_aIPfS0_SaIfEET0_T_S3_S2_RT1_.exit.i.i + %124 = ptrtoint i8* %121 to i64 + %125 = ptrtoint float* %incdec.ptr.i.i to i64 + store i64 %125, i64* %psnr_list.sroa.9, align 8 + %add.ptr23.i.i = getelementptr inbounds float, float* %cond.i36.i.i, i64 %cond.i.i.i + br label %_ZNSt6vectorIfSaIfEE9push_backERKf.exit + +_ZNSt6vectorIfSaIfEE9push_backERKf.exit: ; preds = %if.then.i144, %_ZNSt6vectorIfSaIfEE19_M_emplace_back_auxIJRKfEEEvDpOT_.exit.i + %psnr_list.sroa.0.1 = phi i64 [ %124, %_ZNSt6vectorIfSaIfEE19_M_emplace_back_auxIJRKfEEEvDpOT_.exit.i ], [ %psnr_list.sroa.0.0188, %if.then.i144 ] + %_M_end_of_storage.sink.i = phi float** [ %psnr_list.sroa.13, %_ZNSt6vectorIfSaIfEE19_M_emplace_back_auxIJRKfEEEvDpOT_.exit.i ], [ %psnr_list.sroa.9.0._M_finish.i.sroa_cast, %if.then.i144 ] + %add.ptr23.i.sink.i = phi float* [ %add.ptr23.i.i, %_ZNSt6vectorIfSaIfEE19_M_emplace_back_auxIJRKfEEEvDpOT_.exit.i ], [ %incdec.ptr.i, %if.then.i144 ] + store float* %add.ptr23.i.sink.i, float** %_M_end_of_storage.sink.i, align 8, !tbaa !73 + call void @llvm.lifetime.start(i64 376, i8* nonnull %13) #2 + call void @_ZNSt8ios_baseC2Ev(%"class.std::ios_base"* %15) #2 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [4 x i8*] }, { [4 x i8*] }* @_ZTVSt9basic_iosIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %16, align 16, !tbaa !46 + store %"class.std::basic_ostream"* null, %"class.std::basic_ostream"** %_M_tie.i.i, align 8, !tbaa !48 + store i8 0, i8* %_M_fill.i.i, align 16, !tbaa !51 + store i8 0, i8* %_M_fill_init.i.i, align 1, !tbaa !52 + call void @llvm.memset.p0i8.i64(i8* %17, i8 0, i64 32, i32 8, i1 false) #2 + store i64 %18, i64* %19, align 16, !tbaa !46 + %vbase.offset.i.i = load i64, i64* %22, align 8 + %add.ptr.i.i145 = getelementptr inbounds i8, i8* %13, i64 %vbase.offset.i.i + %126 = bitcast i8* %add.ptr.i.i145 to i64* + store i64 %20, i64* %126, align 8, !tbaa !46 + %vtable3.i.i = load i8*, i8** %21, align 16, !tbaa !46 + %vbase.offset.ptr4.i.i = getelementptr i8, i8* %vtable3.i.i, i64 -24 + %127 = bitcast i8* %vbase.offset.ptr4.i.i to i64* + %vbase.offset5.i.i = load i64, i64* %127, align 8 + %add.ptr6.i.i = getelementptr inbounds i8, i8* %13, i64 %vbase.offset5.i.i + %128 = bitcast i8* %add.ptr6.i.i to %"class.std::basic_ios"* + call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %128, %"class.std::basic_streambuf"* null) #2 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 1, i64 3) to i32 (...)**), i32 (...)*** %16, align 16, !tbaa !46 + store <2 x i32 (...)**> <i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 3) to i32 (...)**), i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**)>, <2 x i32 (...)**>* %38, align 16, !tbaa !46 + call void @llvm.memset.p0i8.i64(i8* %24, i8 0, i64 48, i32 8, i1 false) #2 + call void @_ZNSt6localeC1Ev(%"class.std::locale"* %_M_buf_locale.i.i.i) #2 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %23, align 8, !tbaa !46 + store i32 16, i32* %_M_mode.i.i, align 8, !tbaa !53 + store %union.anon* %25, %union.anon** %26, align 8, !tbaa !58 + store i64 0, i64* %_M_string_length.i.i.i.i.i, align 8, !tbaa !59 + store i8 0, i8* %.cast.i.i.i, align 8, !tbaa !42 + %vtable.i = load i8*, i8** %21, align 16, !tbaa !46 + %vbase.offset.ptr.i = getelementptr i8, i8* %vtable.i, i64 -24 + %129 = bitcast i8* %vbase.offset.ptr.i to i64* + %vbase.offset.i = load i64, i64* %129, align 8 + %add.ptr2.i = getelementptr inbounds i8, i8* %13, i64 %vbase.offset.i + %130 = bitcast i8* %add.ptr2.i to %"class.std::basic_ios"* + call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %130, %"class.std::basic_streambuf"* %27) #2 + %vtable.i146 = load i8*, i8** %21, align 16, !tbaa !46 + %vbase.offset.ptr.i147 = getelementptr i8, i8* %vtable.i146, i64 -24 + %131 = bitcast i8* %vbase.offset.ptr.i147 to i64* + %vbase.offset.i148 = load i64, i64* %131, align 8 + %add.ptr.i = getelementptr inbounds i8, i8* %13, i64 %vbase.offset.i148 + %_M_flags.i.i = getelementptr inbounds i8, i8* %add.ptr.i, i64 24 + %132 = bitcast i8* %_M_flags.i.i to i32* + %133 = load i32, i32* %132, align 8, !tbaa !65 + %and.i.i.i.i = and i32 %133, -261 + %or.i.i.i.i = or i32 %and.i.i.i.i, 4 + store i32 %or.i.i.i.i, i32* %132, align 4, !tbaa !60 + %call.i151 = call dereferenceable(272) %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"* nonnull %28, double %conv33) #2 + call void @llvm.lifetime.start(i64 32, i8* nonnull %29) #2 + call void @_ZNKSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEE3strEv(%"class.std::__cxx11::basic_string"* nonnull sret %print_str, %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i) #2 + %134 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !62 + %135 = load i64, i64* %_M_string_length.i, align 8, !tbaa !59 + %call39 = call i64 @fwrite(i8* %134, i64 1, i64 %135, %struct._IO_FILE* %call8) + %fputc = call i32 @fputc(i32 10, %struct._IO_FILE* %call8) + %136 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !62 + %cmp.i.i.i = icmp eq i8* %136, %arraydecay.i.i.i.i + br i1 %cmp.i.i.i, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit, label %if.then.i.i + +if.then.i.i: ; preds = %_ZNSt6vectorIfSaIfEE9push_backERKf.exit + call void @_ZdlPv(i8* %136) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit: ; preds = %_ZNSt6vectorIfSaIfEE9push_backERKf.exit, %if.then.i.i + call void @llvm.lifetime.end(i64 32, i8* nonnull %29) #2 + store i64 %31, i64* %19, align 16, !tbaa !46 + %vbase.offset.i.i155 = load i64, i64* %33, align 8 + %add.ptr.i.i156 = getelementptr inbounds i8, i8* %13, i64 %vbase.offset.i.i155 + %137 = bitcast i8* %add.ptr.i.i156 to i64* + store i64 %32, i64* %137, align 8, !tbaa !46 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %34, align 8, !tbaa !46 + %138 = load i8*, i8** %_M_p.i.i.i.i.i.i.i, align 8, !tbaa !62 + %cmp.i.i.i.i.i.i = icmp eq i8* %138, %.cast.i.i.i + br i1 %cmp.i.i.i.i.i.i, label %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit, label %if.then.i.i.i.i.i157 + +if.then.i.i.i.i.i157: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit + call void @_ZdlPv(i8* %138) #2 + br label %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit + +_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit, %if.then.i.i.i.i.i157 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %34, align 8, !tbaa !46 + call void @_ZNSt6localeD1Ev(%"class.std::locale"* nonnull %_M_buf_locale.i.i.i) #2 + call void @_ZNSt8ios_baseD2Ev(%"class.std::ios_base"* nonnull %35) #2 + call void @llvm.lifetime.end(i64 376, i8* nonnull %13) #2 + %inc42 = add nuw i64 %i.0189, 1 + %cmp = icmp ult i64 %inc42, %5 + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit + +for.body13: ; preds = %for.body13.preheader239, %for.body13 + %mse_sum.0185 = phi float [ %add18, %for.body13 ], [ %mse_sum.0185.ph, %for.body13.preheader239 ] + %j.0184 = phi i64 [ %inc, %for.body13 ], [ %j.0184.ph, %for.body13.preheader239 ] + %add = add i64 %j.0184, %mul9 + %arrayidx14 = getelementptr inbounds float, float* %10, i64 %add + %139 = load float, float* %arrayidx14, align 4, !tbaa !20 + %arrayidx16 = getelementptr inbounds float, float* %12, i64 %add + %140 = load float, float* %arrayidx16, align 4, !tbaa !20 + %sub = fsub fast float %139, %140 + %mul17 = fmul fast float %sub, %sub + %add18 = fadd fast float %mul17, %mse_sum.0185 + %inc = add nuw i64 %j.0184, 1 + %exitcond197 = icmp eq i64 %inc, %mul5 + br i1 %exitcond197, label %for.cond.cleanup12.loopexit, label %for.body13, !llvm.loop !78 + +for.cond.cleanup63.loopexit: ; preds = %for.body64 + br label %for.cond.cleanup63 + +for.cond.cleanup63: ; preds = %for.cond.cleanup63.loopexit, %middle.block211, %for.cond.cleanup + %var.0.lcssa = phi float [ 0.000000e+00, %for.cond.cleanup ], [ %74, %middle.block211 ], [ %add70, %for.cond.cleanup63.loopexit ] + %div75 = fdiv fast float %var.0.lcssa, %conv52 + %call.i158 = call fast float @sqrtf(float %div75) #12 + call void @_Z11dumpPSNRStdf(float %call.i158) + %tobool.i.i.i159 = icmp eq i64 %psnr_list.sroa.0.0.lcssa, 0 + br i1 %tobool.i.i.i159, label %_ZNSt6vectorIfSaIfEED2Ev.exit, label %if.then.i.i.i + +if.then.i.i.i: ; preds = %for.cond.cleanup63 + %141 = inttoptr i64 %psnr_list.sroa.0.0.lcssa to i8* + call void @_ZdlPv(i8* %141) #2 + br label %_ZNSt6vectorIfSaIfEED2Ev.exit + +_ZNSt6vectorIfSaIfEED2Ev.exit: ; preds = %for.cond.cleanup63, %if.then.i.i.i + call void @llvm.lifetime.end(i64 8, i8* nonnull %psnr_list.sroa.9.0..sroa_cast174) + call void @llvm.lifetime.end(i64 8, i8* nonnull %psnr_list.sroa.13.0..sroa_cast172) + ret float %conv49 + +for.body64: ; preds = %for.body64.preheader, %for.body64 + %i60.0181 = phi i64 [ %inc72, %for.body64 ], [ %i60.0181.ph, %for.body64.preheader ] + %var.0180 = phi float [ %add70, %for.body64 ], [ %var.0180.ph, %for.body64.preheader ] + %add.ptr.i160 = getelementptr inbounds float, float* %41, i64 %i60.0181 + %142 = load float, float* %add.ptr.i160, align 4, !tbaa !20 + %sub66 = fsub fast float %142, %div53 + %mul69 = fmul fast float %sub66, %sub66 + %add70 = fadd fast float %mul69, %var.0180 + %inc72 = add nuw i64 %i60.0181, 1 + %exitcond = icmp eq i64 %inc72, %5 + br i1 %exitcond, label %for.cond.cleanup63.loopexit, label %for.body64, !llvm.loop !79 +} + +; Function Attrs: nounwind uwtable +define void @_Z10dumpOutputPvPKc(i8* nocapture readonly %output_ptr, i8* nocapture readonly %file_name) local_unnamed_addr #3 { +entry: + %size_in_bytes1 = getelementptr inbounds i8, i8* %output_ptr, i64 80 + %0 = bitcast i8* %size_in_bytes1 to i64* + %1 = load i64, i64* %0, align 8, !tbaa !18 + %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str.40, i64 0, i64 0), i64 %1) + %host_data2 = getelementptr inbounds i8, i8* %output_ptr, i64 48 + %2 = bitcast i8* %host_data2 to i8** + %3 = load i8*, i8** %2, align 8, !tbaa !17 + %call3 = tail call %struct._IO_FILE* @fopen(i8* %file_name, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.23, i64 0, i64 0)) + %call4 = tail call i64 @fwrite(i8* %3, i64 1, i64 %1, %struct._IO_FILE* %call3) + %call5 = tail call i32 @fclose(%struct._IO_FILE* %call3) + ret void +} + +; Function Attrs: nounwind uwtable +define void @_Z10var_0_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #3 { +entry: + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #2 + %call = tail call i8* @__visc__tensor_convolution(i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 + ret void +} + +declare void @__visc__hint(i32) local_unnamed_addr #0 + +declare void @__visc__attributes(i32, ...) local_unnamed_addr #0 + +declare i8* @__visc__tensor_convolution(i8*, i8*, i32, i32, i32, i32) local_unnamed_addr #0 + +declare void @__visc__return(i32, ...) local_unnamed_addr #0 + +; Function Attrs: nounwind uwtable +define void @_Z10var_1_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #3 { +entry: + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #2 + %call = tail call i8* @__visc__tensor_add(i8* %t1, i8* %t2) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 + ret void +} + +declare i8* @__visc__tensor_add(i8*, i8*) local_unnamed_addr #0 + +; Function Attrs: nounwind uwtable +define void @_Z10var_2_nodePvm(i8* %t1, i64 %bytes_t1) #3 { +entry: + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 1, i8* %t1, i32 0) #2 + %call = tail call i8* @__visc__tensor_relu(i8* %t1) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } -declare i8* @__visc__tensor_relu(i8*) local_unnamed_addr #3 +declare i8* @__visc__tensor_relu(i8*) local_unnamed_addr #0 ; Function Attrs: nounwind uwtable -define void @_Z10var_3_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #0 { +define void @_Z10var_3_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #7 - %call = tail call i8* @__visc__tensor_convolution(i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #2 + %call = tail call i8* @__visc__tensor_convolution(i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z10var_4_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #0 { +define void @_Z10var_4_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #7 - %call = tail call i8* @__visc__tensor_add(i8* %t1, i8* %t2) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #2 + %call = tail call i8* @__visc__tensor_add(i8* %t1, i8* %t2) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z10var_5_nodePvm(i8* %t1, i64 %bytes_t1) #0 { +define void @_Z10var_5_nodePvm(i8* %t1, i64 %bytes_t1) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 1, i8* %t1, i32 0) #7 - %call = tail call i8* @__visc__tensor_relu(i8* %t1) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 1, i8* %t1, i32 0) #2 + %call = tail call i8* @__visc__tensor_relu(i8* %t1) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z10var_6_nodePvm(i8* %t1, i64 %bytes_t1) #0 { +define void @_Z10var_6_nodePvm(i8* %t1, i64 %bytes_t1) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 1, i8* %t1, i32 0) #7 - %call = tail call i8* @__visc__tensor_pool_max(i8* %t1, i32 2, i32 2, i32 0, i32 0, i32 2, i32 2) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 1, i8* %t1, i32 0) #2 + %call = tail call i8* @__visc__tensor_pool_max(i8* %t1, i32 2, i32 2, i32 0, i32 0, i32 2, i32 2) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } -declare i8* @__visc__tensor_pool_max(i8*, i32, i32, i32, i32, i32, i32) local_unnamed_addr #3 +declare i8* @__visc__tensor_pool_max(i8*, i32, i32, i32, i32, i32, i32) local_unnamed_addr #0 ; Function Attrs: nounwind uwtable -define void @_Z10var_7_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #0 { +define void @_Z10var_7_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #7 - %call = tail call i8* @__visc__tensor_convolution(i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #2 + %call = tail call i8* @__visc__tensor_convolution(i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z10var_8_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #0 { +define void @_Z10var_8_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #7 - %call = tail call i8* @__visc__tensor_add(i8* %t1, i8* %t2) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #2 + %call = tail call i8* @__visc__tensor_add(i8* %t1, i8* %t2) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z10var_9_nodePvm(i8* %t1, i64 %bytes_t1) #0 { +define void @_Z10var_9_nodePvm(i8* %t1, i64 %bytes_t1) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 1, i8* %t1, i32 0) #7 - %call = tail call i8* @__visc__tensor_relu(i8* %t1) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 1, i8* %t1, i32 0) #2 + %call = tail call i8* @__visc__tensor_relu(i8* %t1) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z11var_10_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #0 { +define void @_Z11var_10_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #7 - %call = tail call i8* @__visc__tensor_convolution(i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #2 + %call = tail call i8* @__visc__tensor_convolution(i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z11var_11_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #0 { +define void @_Z11var_11_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #7 - %call = tail call i8* @__visc__tensor_add(i8* %t1, i8* %t2) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #2 + %call = tail call i8* @__visc__tensor_add(i8* %t1, i8* %t2) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z11var_12_nodePvm(i8* %t1, i64 %bytes_t1) #0 { +define void @_Z11var_12_nodePvm(i8* %t1, i64 %bytes_t1) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 1, i8* %t1, i32 0) #7 - %call = tail call i8* @__visc__tensor_relu(i8* %t1) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 1, i8* %t1, i32 0) #2 + %call = tail call i8* @__visc__tensor_relu(i8* %t1) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z11var_13_nodePvm(i8* %t1, i64 %bytes_t1) #0 { +define void @_Z11var_13_nodePvm(i8* %t1, i64 %bytes_t1) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 1, i8* %t1, i32 0) #7 - %call = tail call i8* @__visc__tensor_pool_max(i8* %t1, i32 2, i32 2, i32 0, i32 0, i32 2, i32 2) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 1, i8* %t1, i32 0) #2 + %call = tail call i8* @__visc__tensor_pool_max(i8* %t1, i32 2, i32 2, i32 0, i32 0, i32 2, i32 2) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z11var_14_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #0 { +define void @_Z11var_14_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #7 - %call = tail call i8* @__visc__tensor_convolution(i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #2 + %call = tail call i8* @__visc__tensor_convolution(i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z11var_15_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #0 { +define void @_Z11var_15_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #7 - %call = tail call i8* @__visc__tensor_add(i8* %t1, i8* %t2) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #2 + %call = tail call i8* @__visc__tensor_add(i8* %t1, i8* %t2) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z11var_16_nodePvm(i8* %t1, i64 %bytes_t1) #0 { +define void @_Z11var_16_nodePvm(i8* %t1, i64 %bytes_t1) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 1, i8* %t1, i32 0) #7 - %call = tail call i8* @__visc__tensor_relu(i8* %t1) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 1, i8* %t1, i32 0) #2 + %call = tail call i8* @__visc__tensor_relu(i8* %t1) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z11var_17_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #0 { +define void @_Z11var_17_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #7 - %call = tail call i8* @__visc__tensor_convolution(i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #2 + %call = tail call i8* @__visc__tensor_convolution(i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z11var_18_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #0 { +define void @_Z11var_18_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #7 - %call = tail call i8* @__visc__tensor_add(i8* %t1, i8* %t2) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #2 + %call = tail call i8* @__visc__tensor_add(i8* %t1, i8* %t2) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z11var_19_nodePvm(i8* %t1, i64 %bytes_t1) #0 { +define void @_Z11var_19_nodePvm(i8* %t1, i64 %bytes_t1) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 1, i8* %t1, i32 0) #7 - %call = tail call i8* @__visc__tensor_relu(i8* %t1) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 1, i8* %t1, i32 0) #2 + %call = tail call i8* @__visc__tensor_relu(i8* %t1) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z11var_20_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #0 { +define void @_Z11var_20_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #7 - %call = tail call i8* @__visc__tensor_convolution(i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #2 + %call = tail call i8* @__visc__tensor_convolution(i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z11var_21_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #0 { +define void @_Z11var_21_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #7 - %call = tail call i8* @__visc__tensor_add(i8* %t1, i8* %t2) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #2 + %call = tail call i8* @__visc__tensor_add(i8* %t1, i8* %t2) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z11var_22_nodePvm(i8* %t1, i64 %bytes_t1) #0 { +define void @_Z11var_22_nodePvm(i8* %t1, i64 %bytes_t1) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 1, i8* %t1, i32 0) #7 - %call = tail call i8* @__visc__tensor_relu(i8* %t1) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 1, i8* %t1, i32 0) #2 + %call = tail call i8* @__visc__tensor_relu(i8* %t1) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z11var_23_nodePvm(i8* %t1, i64 %bytes_t1) #0 { +define void @_Z11var_23_nodePvm(i8* %t1, i64 %bytes_t1) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 1, i8* %t1, i32 0) #7 - %call = tail call i8* @__visc__tensor_pool_max(i8* %t1, i32 2, i32 2, i32 0, i32 0, i32 2, i32 2) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 1, i8* %t1, i32 0) #2 + %call = tail call i8* @__visc__tensor_pool_max(i8* %t1, i32 2, i32 2, i32 0, i32 0, i32 2, i32 2) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z11var_24_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #0 { +define void @_Z11var_24_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #7 - %call = tail call i8* @__visc__tensor_convolution(i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #2 + %call = tail call i8* @__visc__tensor_convolution(i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z11var_25_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #0 { +define void @_Z11var_25_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #7 - %call = tail call i8* @__visc__tensor_add(i8* %t1, i8* %t2) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #2 + %call = tail call i8* @__visc__tensor_add(i8* %t1, i8* %t2) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z11var_26_nodePvm(i8* %t1, i64 %bytes_t1) #0 { +define void @_Z11var_26_nodePvm(i8* %t1, i64 %bytes_t1) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 1, i8* %t1, i32 0) #7 - %call = tail call i8* @__visc__tensor_relu(i8* %t1) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 1, i8* %t1, i32 0) #2 + %call = tail call i8* @__visc__tensor_relu(i8* %t1) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z11var_27_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #0 { +define void @_Z11var_27_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #7 - %call = tail call i8* @__visc__tensor_convolution(i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #2 + %call = tail call i8* @__visc__tensor_convolution(i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z11var_28_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #0 { +define void @_Z11var_28_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #7 - %call = tail call i8* @__visc__tensor_add(i8* %t1, i8* %t2) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #2 + %call = tail call i8* @__visc__tensor_add(i8* %t1, i8* %t2) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z11var_29_nodePvm(i8* %t1, i64 %bytes_t1) #0 { +define void @_Z11var_29_nodePvm(i8* %t1, i64 %bytes_t1) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 1, i8* %t1, i32 0) #7 - %call = tail call i8* @__visc__tensor_relu(i8* %t1) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 1, i8* %t1, i32 0) #2 + %call = tail call i8* @__visc__tensor_relu(i8* %t1) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z11var_30_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #0 { +define void @_Z11var_30_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #7 - %call = tail call i8* @__visc__tensor_convolution(i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #2 + %call = tail call i8* @__visc__tensor_convolution(i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z11var_31_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #0 { +define void @_Z11var_31_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #7 - %call = tail call i8* @__visc__tensor_add(i8* %t1, i8* %t2) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #2 + %call = tail call i8* @__visc__tensor_add(i8* %t1, i8* %t2) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z11var_32_nodePvm(i8* %t1, i64 %bytes_t1) #0 { +define void @_Z11var_32_nodePvm(i8* %t1, i64 %bytes_t1) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 1, i8* %t1, i32 0) #7 - %call = tail call i8* @__visc__tensor_relu(i8* %t1) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 1, i8* %t1, i32 0) #2 + %call = tail call i8* @__visc__tensor_relu(i8* %t1) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z11var_33_nodePvm(i8* %t1, i64 %bytes_t1) #0 { +define void @_Z11var_33_nodePvm(i8* %t1, i64 %bytes_t1) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 1, i8* %t1, i32 0) #7 - %call = tail call i8* @__visc__tensor_pool_max(i8* %t1, i32 2, i32 2, i32 0, i32 0, i32 2, i32 2) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 1, i8* %t1, i32 0) #2 + %call = tail call i8* @__visc__tensor_pool_max(i8* %t1, i32 2, i32 2, i32 0, i32 0, i32 2, i32 2) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z11var_34_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #0 { +define void @_Z11var_34_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #7 - %call = tail call i8* @__visc__tensor_convolution(i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #2 + %call = tail call i8* @__visc__tensor_convolution(i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z11var_35_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #0 { +define void @_Z11var_35_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #7 - %call = tail call i8* @__visc__tensor_add(i8* %t1, i8* %t2) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #2 + %call = tail call i8* @__visc__tensor_add(i8* %t1, i8* %t2) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z11var_36_nodePvm(i8* %t1, i64 %bytes_t1) #0 { +define void @_Z11var_36_nodePvm(i8* %t1, i64 %bytes_t1) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 1, i8* %t1, i32 0) #7 - %call = tail call i8* @__visc__tensor_relu(i8* %t1) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 1, i8* %t1, i32 0) #2 + %call = tail call i8* @__visc__tensor_relu(i8* %t1) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z11var_37_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #0 { +define void @_Z11var_37_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #7 - %call = tail call i8* @__visc__tensor_convolution(i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #2 + %call = tail call i8* @__visc__tensor_convolution(i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z11var_38_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #0 { +define void @_Z11var_38_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #7 - %call = tail call i8* @__visc__tensor_add(i8* %t1, i8* %t2) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #2 + %call = tail call i8* @__visc__tensor_add(i8* %t1, i8* %t2) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z11var_39_nodePvm(i8* %t1, i64 %bytes_t1) #0 { +define void @_Z11var_39_nodePvm(i8* %t1, i64 %bytes_t1) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 1, i8* %t1, i32 0) #7 - %call = tail call i8* @__visc__tensor_relu(i8* %t1) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 1, i8* %t1, i32 0) #2 + %call = tail call i8* @__visc__tensor_relu(i8* %t1) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z11var_40_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #0 { +define void @_Z11var_40_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #7 - %call = tail call i8* @__visc__tensor_convolution(i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #2 + %call = tail call i8* @__visc__tensor_convolution(i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z11var_41_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #0 { +define void @_Z11var_41_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #7 - %call = tail call i8* @__visc__tensor_add(i8* %t1, i8* %t2) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #2 + %call = tail call i8* @__visc__tensor_add(i8* %t1, i8* %t2) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z11var_42_nodePvm(i8* %t1, i64 %bytes_t1) #0 { +define void @_Z11var_42_nodePvm(i8* %t1, i64 %bytes_t1) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 1, i8* %t1, i32 0) #7 - %call = tail call i8* @__visc__tensor_relu(i8* %t1) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 1, i8* %t1, i32 0) #2 + %call = tail call i8* @__visc__tensor_relu(i8* %t1) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z11var_43_nodePvm(i8* %t1, i64 %bytes_t1) #0 { +define void @_Z11var_43_nodePvm(i8* %t1, i64 %bytes_t1) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 1, i8* %t1, i32 0) #7 - %call = tail call i8* @__visc__tensor_pool_max(i8* %t1, i32 2, i32 2, i32 0, i32 0, i32 2, i32 2) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 1, i8* %t1, i32 0) #2 + %call = tail call i8* @__visc__tensor_pool_max(i8* %t1, i32 2, i32 2, i32 0, i32 0, i32 2, i32 2) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z11var_44_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #0 { +define void @_Z11var_44_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #7 - %call = tail call i8* @__visc__tensor_mul(i8* %t1, i8* %t2) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #2 + %call = tail call i8* @__visc__tensor_mul(i8* %t1, i8* %t2) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } -declare i8* @__visc__tensor_mul(i8*, i8*) local_unnamed_addr #3 +declare i8* @__visc__tensor_mul(i8*, i8*) local_unnamed_addr #0 ; Function Attrs: nounwind uwtable -define void @_Z11var_45_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #0 { +define void @_Z11var_45_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #7 - %call = tail call i8* @__visc__tensor_add(i8* %t1, i8* %t2) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #2 + %call = tail call i8* @__visc__tensor_add(i8* %t1, i8* %t2) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z11var_46_nodePvm(i8* %t1, i64 %bytes_t1) #0 { +define void @_Z11var_46_nodePvm(i8* %t1, i64 %bytes_t1) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 1, i8* %t1, i32 0) #7 - %call = tail call i8* @__visc__tensor_relu(i8* %t1) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 1, i8* %t1, i32 0) #2 + %call = tail call i8* @__visc__tensor_relu(i8* %t1) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z11var_47_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #0 { +define void @_Z11var_47_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #7 - %call = tail call i8* @__visc__tensor_mul(i8* %t1, i8* %t2) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #2 + %call = tail call i8* @__visc__tensor_mul(i8* %t1, i8* %t2) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z11var_48_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #0 { +define void @_Z11var_48_nodePvmS_m(i8* %t1, i64 %bytes_t1, i8* %t2, i64 %bytes_t2) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #7 - %call = tail call i8* @__visc__tensor_add(i8* %t1, i8* %t2) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 2, i8* %t1, i8* %t2, i32 0) #2 + %call = tail call i8* @__visc__tensor_add(i8* %t1, i8* %t2) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z11var_49_nodePvm(i8* %t1, i64 %bytes_t1) #0 { +define void @_Z11var_49_nodePvm(i8* %t1, i64 %bytes_t1) #3 { entry: - tail call void @__visc__hint(i32 4) #7 - tail call void (i32, ...) @__visc__attributes(i32 1, i8* %t1, i32 0) #7 - %call = tail call i8* @__visc__tensor_softmax(i8* %t1) #7 - tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #7 + tail call void @__visc__hint(i32 4) #2 + tail call void (i32, ...) @__visc__attributes(i32 1, i8* %t1, i32 0) #2 + %call = tail call i8* @__visc__tensor_softmax(i8* %t1) #2 + tail call void (i32, ...) @__visc__return(i32 2, i8* %call, i64 0) #2 ret void } -declare i8* @__visc__tensor_softmax(i8*) local_unnamed_addr #3 +declare i8* @__visc__tensor_softmax(i8*) local_unnamed_addr #0 ; Function Attrs: nounwind uwtable -define void @_Z4rootPvmS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_m(i8* %input, i64 %input_bytes, i8* %conv2d_1_w, i64 %conv2d_1_w_bytes, i8* %conv2d_1_b, i64 %conv2d_1_b_bytes, i8* %conv2d_2_w, i64 %conv2d_2_w_bytes, i8* %conv2d_2_b, i64 %conv2d_2_b_bytes, i8* %conv2d_3_w, i64 %conv2d_3_w_bytes, i8* %conv2d_3_b, i64 %conv2d_3_b_bytes, i8* %conv2d_4_w, i64 %conv2d_4_w_bytes, i8* %conv2d_4_b, i64 %conv2d_4_b_bytes, i8* %conv2d_5_w, i64 %conv2d_5_w_bytes, i8* %conv2d_5_b, i64 %conv2d_5_b_bytes, i8* %conv2d_6_w, i64 %conv2d_6_w_bytes, i8* %conv2d_6_b, i64 %conv2d_6_b_bytes, i8* %conv2d_7_w, i64 %conv2d_7_w_bytes, i8* %conv2d_7_b, i64 %conv2d_7_b_bytes, i8* %conv2d_8_w, i64 %conv2d_8_w_bytes, i8* %conv2d_8_b, i64 %conv2d_8_b_bytes, i8* %conv2d_9_w, i64 %conv2d_9_w_bytes, i8* %conv2d_9_b, i64 %conv2d_9_b_bytes, i8* %conv2d_10_w, i64 %conv2d_10_w_bytes, i8* %conv2d_10_b, i64 %conv2d_10_b_bytes, i8* %conv2d_11_w, i64 %conv2d_11_w_bytes, i8* %conv2d_11_b, i64 %conv2d_11_b_bytes, i8* %conv2d_12_w, i64 %conv2d_12_w_bytes, i8* %conv2d_12_b, i64 %conv2d_12_b_bytes, i8* %conv2d_13_w, i64 %conv2d_13_w_bytes, i8* %conv2d_13_b, i64 %conv2d_13_b_bytes, i8* %dense_1_w, i64 %dense_1_w_bytes, i8* %dense_1_b, i64 %dense_1_b_bytes, i8* %dense_2_w, i64 %dense_2_w_bytes, i8* %dense_2_b, i64 %dense_2_b_bytes) #0 { +define void @_Z4rootPvmS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_m(i8* %input, i64 %input_bytes, i8* %conv2d_1_w, i64 %conv2d_1_w_bytes, i8* %conv2d_1_b, i64 %conv2d_1_b_bytes, i8* %conv2d_2_w, i64 %conv2d_2_w_bytes, i8* %conv2d_2_b, i64 %conv2d_2_b_bytes, i8* %conv2d_3_w, i64 %conv2d_3_w_bytes, i8* %conv2d_3_b, i64 %conv2d_3_b_bytes, i8* %conv2d_4_w, i64 %conv2d_4_w_bytes, i8* %conv2d_4_b, i64 %conv2d_4_b_bytes, i8* %conv2d_5_w, i64 %conv2d_5_w_bytes, i8* %conv2d_5_b, i64 %conv2d_5_b_bytes, i8* %conv2d_6_w, i64 %conv2d_6_w_bytes, i8* %conv2d_6_b, i64 %conv2d_6_b_bytes, i8* %conv2d_7_w, i64 %conv2d_7_w_bytes, i8* %conv2d_7_b, i64 %conv2d_7_b_bytes, i8* %conv2d_8_w, i64 %conv2d_8_w_bytes, i8* %conv2d_8_b, i64 %conv2d_8_b_bytes, i8* %conv2d_9_w, i64 %conv2d_9_w_bytes, i8* %conv2d_9_b, i64 %conv2d_9_b_bytes, i8* %conv2d_10_w, i64 %conv2d_10_w_bytes, i8* %conv2d_10_b, i64 %conv2d_10_b_bytes, i8* %conv2d_11_w, i64 %conv2d_11_w_bytes, i8* %conv2d_11_b, i64 %conv2d_11_b_bytes, i8* %conv2d_12_w, i64 %conv2d_12_w_bytes, i8* %conv2d_12_b, i64 %conv2d_12_b_bytes, i8* %conv2d_13_w, i64 %conv2d_13_w_bytes, i8* %conv2d_13_b, i64 %conv2d_13_b_bytes, i8* %dense_1_w, i64 %dense_1_w_bytes, i8* %dense_1_b, i64 %dense_1_b_bytes, i8* %dense_2_w, i64 %dense_2_w_bytes, i8* %dense_2_b, i64 %dense_2_b_bytes) #3 { entry: - tail call void @__visc__hint(i32 1) #7 - tail call void (i32, ...) @__visc__attributes(i32 31, i8* %input, i8* %conv2d_1_w, i8* %conv2d_1_b, i8* %conv2d_2_w, i8* %conv2d_2_b, i8* %conv2d_3_w, i8* %conv2d_3_b, i8* %conv2d_4_w, i8* %conv2d_4_b, i8* %conv2d_5_w, i8* %conv2d_5_b, i8* %conv2d_6_w, i8* %conv2d_6_b, i8* %conv2d_7_w, i8* %conv2d_7_b, i8* %conv2d_8_w, i8* %conv2d_8_b, i8* %conv2d_9_w, i8* %conv2d_9_b, i8* %conv2d_10_w, i8* %conv2d_10_b, i8* %conv2d_11_w, i8* %conv2d_11_b, i8* %conv2d_12_w, i8* %conv2d_12_b, i8* %conv2d_13_w, i8* %conv2d_13_b, i8* %dense_1_w, i8* %dense_1_b, i8* %dense_2_w, i8* %dense_2_b, i32 0) #7 - %call = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z10var_0_nodePvmS_m) #7 - tail call void @__visc__bindIn(i8* %call, i32 0, i32 0, i32 0) #7 - tail call void @__visc__bindIn(i8* %call, i32 1, i32 1, i32 0) #7 - tail call void @__visc__bindIn(i8* %call, i32 2, i32 2, i32 0) #7 - tail call void @__visc__bindIn(i8* %call, i32 3, i32 3, i32 0) #7 - %call1 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z10var_1_nodePvmS_m) #7 - %call2 = tail call i8* @__visc__edge(i8* %call, i8* %call1, i32 1, i32 0, i32 0, i32 0) #7 - %call3 = tail call i8* @__visc__edge(i8* %call, i8* %call1, i32 1, i32 1, i32 1, i32 0) #7 - tail call void @__visc__bindIn(i8* %call1, i32 4, i32 2, i32 0) #7 - tail call void @__visc__bindIn(i8* %call1, i32 5, i32 3, i32 0) #7 - %call4 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64)* nonnull @_Z10var_2_nodePvm) #7 - %call5 = tail call i8* @__visc__edge(i8* %call1, i8* %call4, i32 1, i32 0, i32 0, i32 0) #7 - %call6 = tail call i8* @__visc__edge(i8* %call1, i8* %call4, i32 1, i32 1, i32 1, i32 0) #7 - %call7 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z10var_3_nodePvmS_m) #7 - %call8 = tail call i8* @__visc__edge(i8* %call4, i8* %call7, i32 1, i32 0, i32 0, i32 0) #7 - %call9 = tail call i8* @__visc__edge(i8* %call4, i8* %call7, i32 1, i32 1, i32 1, i32 0) #7 - tail call void @__visc__bindIn(i8* %call7, i32 6, i32 2, i32 0) #7 - tail call void @__visc__bindIn(i8* %call7, i32 7, i32 3, i32 0) #7 - %call10 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z10var_4_nodePvmS_m) #7 - %call11 = tail call i8* @__visc__edge(i8* %call7, i8* %call10, i32 1, i32 0, i32 0, i32 0) #7 - %call12 = tail call i8* @__visc__edge(i8* %call7, i8* %call10, i32 1, i32 1, i32 1, i32 0) #7 - tail call void @__visc__bindIn(i8* %call10, i32 8, i32 2, i32 0) #7 - tail call void @__visc__bindIn(i8* %call10, i32 9, i32 3, i32 0) #7 - %call13 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64)* nonnull @_Z10var_5_nodePvm) #7 - %call14 = tail call i8* @__visc__edge(i8* %call10, i8* %call13, i32 1, i32 0, i32 0, i32 0) #7 - %call15 = tail call i8* @__visc__edge(i8* %call10, i8* %call13, i32 1, i32 1, i32 1, i32 0) #7 - %call16 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64)* nonnull @_Z10var_6_nodePvm) #7 - %call17 = tail call i8* @__visc__edge(i8* %call13, i8* %call16, i32 1, i32 0, i32 0, i32 0) #7 - %call18 = tail call i8* @__visc__edge(i8* %call13, i8* %call16, i32 1, i32 1, i32 1, i32 0) #7 - %call19 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z10var_7_nodePvmS_m) #7 - %call20 = tail call i8* @__visc__edge(i8* %call16, i8* %call19, i32 1, i32 0, i32 0, i32 0) #7 - %call21 = tail call i8* @__visc__edge(i8* %call16, i8* %call19, i32 1, i32 1, i32 1, i32 0) #7 - tail call void @__visc__bindIn(i8* %call19, i32 10, i32 2, i32 0) #7 - tail call void @__visc__bindIn(i8* %call19, i32 11, i32 3, i32 0) #7 - %call22 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z10var_8_nodePvmS_m) #7 - %call23 = tail call i8* @__visc__edge(i8* %call19, i8* %call22, i32 1, i32 0, i32 0, i32 0) #7 - %call24 = tail call i8* @__visc__edge(i8* %call19, i8* %call22, i32 1, i32 1, i32 1, i32 0) #7 - tail call void @__visc__bindIn(i8* %call22, i32 12, i32 2, i32 0) #7 - tail call void @__visc__bindIn(i8* %call22, i32 13, i32 3, i32 0) #7 - %call25 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64)* nonnull @_Z10var_9_nodePvm) #7 - %call26 = tail call i8* @__visc__edge(i8* %call22, i8* %call25, i32 1, i32 0, i32 0, i32 0) #7 - %call27 = tail call i8* @__visc__edge(i8* %call22, i8* %call25, i32 1, i32 1, i32 1, i32 0) #7 - %call28 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_10_nodePvmS_m) #7 - %call29 = tail call i8* @__visc__edge(i8* %call25, i8* %call28, i32 1, i32 0, i32 0, i32 0) #7 - %call30 = tail call i8* @__visc__edge(i8* %call25, i8* %call28, i32 1, i32 1, i32 1, i32 0) #7 - tail call void @__visc__bindIn(i8* %call28, i32 14, i32 2, i32 0) #7 - tail call void @__visc__bindIn(i8* %call28, i32 15, i32 3, i32 0) #7 - %call31 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_11_nodePvmS_m) #7 - %call32 = tail call i8* @__visc__edge(i8* %call28, i8* %call31, i32 1, i32 0, i32 0, i32 0) #7 - %call33 = tail call i8* @__visc__edge(i8* %call28, i8* %call31, i32 1, i32 1, i32 1, i32 0) #7 - tail call void @__visc__bindIn(i8* %call31, i32 16, i32 2, i32 0) #7 - tail call void @__visc__bindIn(i8* %call31, i32 17, i32 3, i32 0) #7 - %call34 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64)* nonnull @_Z11var_12_nodePvm) #7 - %call35 = tail call i8* @__visc__edge(i8* %call31, i8* %call34, i32 1, i32 0, i32 0, i32 0) #7 - %call36 = tail call i8* @__visc__edge(i8* %call31, i8* %call34, i32 1, i32 1, i32 1, i32 0) #7 - %call37 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64)* nonnull @_Z11var_13_nodePvm) #7 - %call38 = tail call i8* @__visc__edge(i8* %call34, i8* %call37, i32 1, i32 0, i32 0, i32 0) #7 - %call39 = tail call i8* @__visc__edge(i8* %call34, i8* %call37, i32 1, i32 1, i32 1, i32 0) #7 - %call40 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_14_nodePvmS_m) #7 - %call41 = tail call i8* @__visc__edge(i8* %call37, i8* %call40, i32 1, i32 0, i32 0, i32 0) #7 - %call42 = tail call i8* @__visc__edge(i8* %call37, i8* %call40, i32 1, i32 1, i32 1, i32 0) #7 - tail call void @__visc__bindIn(i8* %call40, i32 18, i32 2, i32 0) #7 - tail call void @__visc__bindIn(i8* %call40, i32 19, i32 3, i32 0) #7 - %call43 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_15_nodePvmS_m) #7 - %call44 = tail call i8* @__visc__edge(i8* %call40, i8* %call43, i32 1, i32 0, i32 0, i32 0) #7 - %call45 = tail call i8* @__visc__edge(i8* %call40, i8* %call43, i32 1, i32 1, i32 1, i32 0) #7 - tail call void @__visc__bindIn(i8* %call43, i32 20, i32 2, i32 0) #7 - tail call void @__visc__bindIn(i8* %call43, i32 21, i32 3, i32 0) #7 - %call46 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64)* nonnull @_Z11var_16_nodePvm) #7 - %call47 = tail call i8* @__visc__edge(i8* %call43, i8* %call46, i32 1, i32 0, i32 0, i32 0) #7 - %call48 = tail call i8* @__visc__edge(i8* %call43, i8* %call46, i32 1, i32 1, i32 1, i32 0) #7 - %call49 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_17_nodePvmS_m) #7 - %call50 = tail call i8* @__visc__edge(i8* %call46, i8* %call49, i32 1, i32 0, i32 0, i32 0) #7 - %call51 = tail call i8* @__visc__edge(i8* %call46, i8* %call49, i32 1, i32 1, i32 1, i32 0) #7 - tail call void @__visc__bindIn(i8* %call49, i32 22, i32 2, i32 0) #7 - tail call void @__visc__bindIn(i8* %call49, i32 23, i32 3, i32 0) #7 - %call52 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_18_nodePvmS_m) #7 - %call53 = tail call i8* @__visc__edge(i8* %call49, i8* %call52, i32 1, i32 0, i32 0, i32 0) #7 - %call54 = tail call i8* @__visc__edge(i8* %call49, i8* %call52, i32 1, i32 1, i32 1, i32 0) #7 - tail call void @__visc__bindIn(i8* %call52, i32 24, i32 2, i32 0) #7 - tail call void @__visc__bindIn(i8* %call52, i32 25, i32 3, i32 0) #7 - %call55 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64)* nonnull @_Z11var_19_nodePvm) #7 - %call56 = tail call i8* @__visc__edge(i8* %call52, i8* %call55, i32 1, i32 0, i32 0, i32 0) #7 - %call57 = tail call i8* @__visc__edge(i8* %call52, i8* %call55, i32 1, i32 1, i32 1, i32 0) #7 - %call58 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_20_nodePvmS_m) #7 - %call59 = tail call i8* @__visc__edge(i8* %call55, i8* %call58, i32 1, i32 0, i32 0, i32 0) #7 - %call60 = tail call i8* @__visc__edge(i8* %call55, i8* %call58, i32 1, i32 1, i32 1, i32 0) #7 - tail call void @__visc__bindIn(i8* %call58, i32 26, i32 2, i32 0) #7 - tail call void @__visc__bindIn(i8* %call58, i32 27, i32 3, i32 0) #7 - %call61 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_21_nodePvmS_m) #7 - %call62 = tail call i8* @__visc__edge(i8* %call58, i8* %call61, i32 1, i32 0, i32 0, i32 0) #7 - %call63 = tail call i8* @__visc__edge(i8* %call58, i8* %call61, i32 1, i32 1, i32 1, i32 0) #7 - tail call void @__visc__bindIn(i8* %call61, i32 28, i32 2, i32 0) #7 - tail call void @__visc__bindIn(i8* %call61, i32 29, i32 3, i32 0) #7 - %call64 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64)* nonnull @_Z11var_22_nodePvm) #7 - %call65 = tail call i8* @__visc__edge(i8* %call61, i8* %call64, i32 1, i32 0, i32 0, i32 0) #7 - %call66 = tail call i8* @__visc__edge(i8* %call61, i8* %call64, i32 1, i32 1, i32 1, i32 0) #7 - %call67 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64)* nonnull @_Z11var_23_nodePvm) #7 - %call68 = tail call i8* @__visc__edge(i8* %call64, i8* %call67, i32 1, i32 0, i32 0, i32 0) #7 - %call69 = tail call i8* @__visc__edge(i8* %call64, i8* %call67, i32 1, i32 1, i32 1, i32 0) #7 - %call70 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_24_nodePvmS_m) #7 - %call71 = tail call i8* @__visc__edge(i8* %call67, i8* %call70, i32 1, i32 0, i32 0, i32 0) #7 - %call72 = tail call i8* @__visc__edge(i8* %call67, i8* %call70, i32 1, i32 1, i32 1, i32 0) #7 - tail call void @__visc__bindIn(i8* %call70, i32 30, i32 2, i32 0) #7 - tail call void @__visc__bindIn(i8* %call70, i32 31, i32 3, i32 0) #7 - %call73 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_25_nodePvmS_m) #7 - %call74 = tail call i8* @__visc__edge(i8* %call70, i8* %call73, i32 1, i32 0, i32 0, i32 0) #7 - %call75 = tail call i8* @__visc__edge(i8* %call70, i8* %call73, i32 1, i32 1, i32 1, i32 0) #7 - tail call void @__visc__bindIn(i8* %call73, i32 32, i32 2, i32 0) #7 - tail call void @__visc__bindIn(i8* %call73, i32 33, i32 3, i32 0) #7 - %call76 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64)* nonnull @_Z11var_26_nodePvm) #7 - %call77 = tail call i8* @__visc__edge(i8* %call73, i8* %call76, i32 1, i32 0, i32 0, i32 0) #7 - %call78 = tail call i8* @__visc__edge(i8* %call73, i8* %call76, i32 1, i32 1, i32 1, i32 0) #7 - %call79 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_27_nodePvmS_m) #7 - %call80 = tail call i8* @__visc__edge(i8* %call76, i8* %call79, i32 1, i32 0, i32 0, i32 0) #7 - %call81 = tail call i8* @__visc__edge(i8* %call76, i8* %call79, i32 1, i32 1, i32 1, i32 0) #7 - tail call void @__visc__bindIn(i8* %call79, i32 34, i32 2, i32 0) #7 - tail call void @__visc__bindIn(i8* %call79, i32 35, i32 3, i32 0) #7 - %call82 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_28_nodePvmS_m) #7 - %call83 = tail call i8* @__visc__edge(i8* %call79, i8* %call82, i32 1, i32 0, i32 0, i32 0) #7 - %call84 = tail call i8* @__visc__edge(i8* %call79, i8* %call82, i32 1, i32 1, i32 1, i32 0) #7 - tail call void @__visc__bindIn(i8* %call82, i32 36, i32 2, i32 0) #7 - tail call void @__visc__bindIn(i8* %call82, i32 37, i32 3, i32 0) #7 - %call85 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64)* nonnull @_Z11var_29_nodePvm) #7 - %call86 = tail call i8* @__visc__edge(i8* %call82, i8* %call85, i32 1, i32 0, i32 0, i32 0) #7 - %call87 = tail call i8* @__visc__edge(i8* %call82, i8* %call85, i32 1, i32 1, i32 1, i32 0) #7 - %call88 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_30_nodePvmS_m) #7 - %call89 = tail call i8* @__visc__edge(i8* %call85, i8* %call88, i32 1, i32 0, i32 0, i32 0) #7 - %call90 = tail call i8* @__visc__edge(i8* %call85, i8* %call88, i32 1, i32 1, i32 1, i32 0) #7 - tail call void @__visc__bindIn(i8* %call88, i32 38, i32 2, i32 0) #7 - tail call void @__visc__bindIn(i8* %call88, i32 39, i32 3, i32 0) #7 - %call91 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_31_nodePvmS_m) #7 - %call92 = tail call i8* @__visc__edge(i8* %call88, i8* %call91, i32 1, i32 0, i32 0, i32 0) #7 - %call93 = tail call i8* @__visc__edge(i8* %call88, i8* %call91, i32 1, i32 1, i32 1, i32 0) #7 - tail call void @__visc__bindIn(i8* %call91, i32 40, i32 2, i32 0) #7 - tail call void @__visc__bindIn(i8* %call91, i32 41, i32 3, i32 0) #7 - %call94 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64)* nonnull @_Z11var_32_nodePvm) #7 - %call95 = tail call i8* @__visc__edge(i8* %call91, i8* %call94, i32 1, i32 0, i32 0, i32 0) #7 - %call96 = tail call i8* @__visc__edge(i8* %call91, i8* %call94, i32 1, i32 1, i32 1, i32 0) #7 - %call97 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64)* nonnull @_Z11var_33_nodePvm) #7 - %call98 = tail call i8* @__visc__edge(i8* %call94, i8* %call97, i32 1, i32 0, i32 0, i32 0) #7 - %call99 = tail call i8* @__visc__edge(i8* %call94, i8* %call97, i32 1, i32 1, i32 1, i32 0) #7 - %call100 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_34_nodePvmS_m) #7 - %call101 = tail call i8* @__visc__edge(i8* %call97, i8* %call100, i32 1, i32 0, i32 0, i32 0) #7 - %call102 = tail call i8* @__visc__edge(i8* %call97, i8* %call100, i32 1, i32 1, i32 1, i32 0) #7 - tail call void @__visc__bindIn(i8* %call100, i32 42, i32 2, i32 0) #7 - tail call void @__visc__bindIn(i8* %call100, i32 43, i32 3, i32 0) #7 - %call103 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_35_nodePvmS_m) #7 - %call104 = tail call i8* @__visc__edge(i8* %call100, i8* %call103, i32 1, i32 0, i32 0, i32 0) #7 - %call105 = tail call i8* @__visc__edge(i8* %call100, i8* %call103, i32 1, i32 1, i32 1, i32 0) #7 - tail call void @__visc__bindIn(i8* %call103, i32 44, i32 2, i32 0) #7 - tail call void @__visc__bindIn(i8* %call103, i32 45, i32 3, i32 0) #7 - %call106 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64)* nonnull @_Z11var_36_nodePvm) #7 - %call107 = tail call i8* @__visc__edge(i8* %call103, i8* %call106, i32 1, i32 0, i32 0, i32 0) #7 - %call108 = tail call i8* @__visc__edge(i8* %call103, i8* %call106, i32 1, i32 1, i32 1, i32 0) #7 - %call109 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_37_nodePvmS_m) #7 - %call110 = tail call i8* @__visc__edge(i8* %call106, i8* %call109, i32 1, i32 0, i32 0, i32 0) #7 - %call111 = tail call i8* @__visc__edge(i8* %call106, i8* %call109, i32 1, i32 1, i32 1, i32 0) #7 - tail call void @__visc__bindIn(i8* %call109, i32 46, i32 2, i32 0) #7 - tail call void @__visc__bindIn(i8* %call109, i32 47, i32 3, i32 0) #7 - %call112 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_38_nodePvmS_m) #7 - %call113 = tail call i8* @__visc__edge(i8* %call109, i8* %call112, i32 1, i32 0, i32 0, i32 0) #7 - %call114 = tail call i8* @__visc__edge(i8* %call109, i8* %call112, i32 1, i32 1, i32 1, i32 0) #7 - tail call void @__visc__bindIn(i8* %call112, i32 48, i32 2, i32 0) #7 - tail call void @__visc__bindIn(i8* %call112, i32 49, i32 3, i32 0) #7 - %call115 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64)* nonnull @_Z11var_39_nodePvm) #7 - %call116 = tail call i8* @__visc__edge(i8* %call112, i8* %call115, i32 1, i32 0, i32 0, i32 0) #7 - %call117 = tail call i8* @__visc__edge(i8* %call112, i8* %call115, i32 1, i32 1, i32 1, i32 0) #7 - %call118 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_40_nodePvmS_m) #7 - %call119 = tail call i8* @__visc__edge(i8* %call115, i8* %call118, i32 1, i32 0, i32 0, i32 0) #7 - %call120 = tail call i8* @__visc__edge(i8* %call115, i8* %call118, i32 1, i32 1, i32 1, i32 0) #7 - tail call void @__visc__bindIn(i8* %call118, i32 50, i32 2, i32 0) #7 - tail call void @__visc__bindIn(i8* %call118, i32 51, i32 3, i32 0) #7 - %call121 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_41_nodePvmS_m) #7 - %call122 = tail call i8* @__visc__edge(i8* %call118, i8* %call121, i32 1, i32 0, i32 0, i32 0) #7 - %call123 = tail call i8* @__visc__edge(i8* %call118, i8* %call121, i32 1, i32 1, i32 1, i32 0) #7 - tail call void @__visc__bindIn(i8* %call121, i32 52, i32 2, i32 0) #7 - tail call void @__visc__bindIn(i8* %call121, i32 53, i32 3, i32 0) #7 - %call124 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64)* nonnull @_Z11var_42_nodePvm) #7 - %call125 = tail call i8* @__visc__edge(i8* %call121, i8* %call124, i32 1, i32 0, i32 0, i32 0) #7 - %call126 = tail call i8* @__visc__edge(i8* %call121, i8* %call124, i32 1, i32 1, i32 1, i32 0) #7 - %call127 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64)* nonnull @_Z11var_43_nodePvm) #7 - %call128 = tail call i8* @__visc__edge(i8* %call124, i8* %call127, i32 1, i32 0, i32 0, i32 0) #7 - %call129 = tail call i8* @__visc__edge(i8* %call124, i8* %call127, i32 1, i32 1, i32 1, i32 0) #7 - %call130 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_44_nodePvmS_m) #7 - %call131 = tail call i8* @__visc__edge(i8* %call127, i8* %call130, i32 1, i32 0, i32 0, i32 0) #7 - %call132 = tail call i8* @__visc__edge(i8* %call127, i8* %call130, i32 1, i32 1, i32 1, i32 0) #7 - tail call void @__visc__bindIn(i8* %call130, i32 54, i32 2, i32 0) #7 - tail call void @__visc__bindIn(i8* %call130, i32 55, i32 3, i32 0) #7 - %call133 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_45_nodePvmS_m) #7 - %call134 = tail call i8* @__visc__edge(i8* %call130, i8* %call133, i32 1, i32 0, i32 0, i32 0) #7 - %call135 = tail call i8* @__visc__edge(i8* %call130, i8* %call133, i32 1, i32 1, i32 1, i32 0) #7 - tail call void @__visc__bindIn(i8* %call133, i32 56, i32 2, i32 0) #7 - tail call void @__visc__bindIn(i8* %call133, i32 57, i32 3, i32 0) #7 - %call136 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64)* nonnull @_Z11var_46_nodePvm) #7 - %call137 = tail call i8* @__visc__edge(i8* %call133, i8* %call136, i32 1, i32 0, i32 0, i32 0) #7 - %call138 = tail call i8* @__visc__edge(i8* %call133, i8* %call136, i32 1, i32 1, i32 1, i32 0) #7 - %call139 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_47_nodePvmS_m) #7 - %call140 = tail call i8* @__visc__edge(i8* %call136, i8* %call139, i32 1, i32 0, i32 0, i32 0) #7 - %call141 = tail call i8* @__visc__edge(i8* %call136, i8* %call139, i32 1, i32 1, i32 1, i32 0) #7 - tail call void @__visc__bindIn(i8* %call139, i32 58, i32 2, i32 0) #7 - tail call void @__visc__bindIn(i8* %call139, i32 59, i32 3, i32 0) #7 - %call142 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_48_nodePvmS_m) #7 - %call143 = tail call i8* @__visc__edge(i8* %call139, i8* %call142, i32 1, i32 0, i32 0, i32 0) #7 - %call144 = tail call i8* @__visc__edge(i8* %call139, i8* %call142, i32 1, i32 1, i32 1, i32 0) #7 - tail call void @__visc__bindIn(i8* %call142, i32 60, i32 2, i32 0) #7 - tail call void @__visc__bindIn(i8* %call142, i32 61, i32 3, i32 0) #7 - %call145 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64)* nonnull @_Z11var_49_nodePvm) #7 - %call146 = tail call i8* @__visc__edge(i8* %call142, i8* %call145, i32 1, i32 0, i32 0, i32 0) #7 - %call147 = tail call i8* @__visc__edge(i8* %call142, i8* %call145, i32 1, i32 1, i32 1, i32 0) #7 - tail call void @__visc__bindOut(i8* %call145, i32 0, i32 0, i32 0) #7 - tail call void @__visc__bindOut(i8* %call145, i32 1, i32 1, i32 0) #7 + tail call void @__visc__hint(i32 1) #2 + tail call void (i32, ...) @__visc__attributes(i32 31, i8* %input, i8* %conv2d_1_w, i8* %conv2d_1_b, i8* %conv2d_2_w, i8* %conv2d_2_b, i8* %conv2d_3_w, i8* %conv2d_3_b, i8* %conv2d_4_w, i8* %conv2d_4_b, i8* %conv2d_5_w, i8* %conv2d_5_b, i8* %conv2d_6_w, i8* %conv2d_6_b, i8* %conv2d_7_w, i8* %conv2d_7_b, i8* %conv2d_8_w, i8* %conv2d_8_b, i8* %conv2d_9_w, i8* %conv2d_9_b, i8* %conv2d_10_w, i8* %conv2d_10_b, i8* %conv2d_11_w, i8* %conv2d_11_b, i8* %conv2d_12_w, i8* %conv2d_12_b, i8* %conv2d_13_w, i8* %conv2d_13_b, i8* %dense_1_w, i8* %dense_1_b, i8* %dense_2_w, i8* %dense_2_b, i32 0) #2 + %call = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z10var_0_nodePvmS_m) #2 + tail call void @__visc__bindIn(i8* %call, i32 0, i32 0, i32 0) #2 + tail call void @__visc__bindIn(i8* %call, i32 1, i32 1, i32 0) #2 + tail call void @__visc__bindIn(i8* %call, i32 2, i32 2, i32 0) #2 + tail call void @__visc__bindIn(i8* %call, i32 3, i32 3, i32 0) #2 + %call1 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z10var_1_nodePvmS_m) #2 + %call2 = tail call i8* @__visc__edge(i8* %call, i8* %call1, i32 1, i32 0, i32 0, i32 0) #2 + %call3 = tail call i8* @__visc__edge(i8* %call, i8* %call1, i32 1, i32 1, i32 1, i32 0) #2 + tail call void @__visc__bindIn(i8* %call1, i32 4, i32 2, i32 0) #2 + tail call void @__visc__bindIn(i8* %call1, i32 5, i32 3, i32 0) #2 + %call4 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64)* nonnull @_Z10var_2_nodePvm) #2 + %call5 = tail call i8* @__visc__edge(i8* %call1, i8* %call4, i32 1, i32 0, i32 0, i32 0) #2 + %call6 = tail call i8* @__visc__edge(i8* %call1, i8* %call4, i32 1, i32 1, i32 1, i32 0) #2 + %call7 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z10var_3_nodePvmS_m) #2 + %call8 = tail call i8* @__visc__edge(i8* %call4, i8* %call7, i32 1, i32 0, i32 0, i32 0) #2 + %call9 = tail call i8* @__visc__edge(i8* %call4, i8* %call7, i32 1, i32 1, i32 1, i32 0) #2 + tail call void @__visc__bindIn(i8* %call7, i32 6, i32 2, i32 0) #2 + tail call void @__visc__bindIn(i8* %call7, i32 7, i32 3, i32 0) #2 + %call10 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z10var_4_nodePvmS_m) #2 + %call11 = tail call i8* @__visc__edge(i8* %call7, i8* %call10, i32 1, i32 0, i32 0, i32 0) #2 + %call12 = tail call i8* @__visc__edge(i8* %call7, i8* %call10, i32 1, i32 1, i32 1, i32 0) #2 + tail call void @__visc__bindIn(i8* %call10, i32 8, i32 2, i32 0) #2 + tail call void @__visc__bindIn(i8* %call10, i32 9, i32 3, i32 0) #2 + %call13 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64)* nonnull @_Z10var_5_nodePvm) #2 + %call14 = tail call i8* @__visc__edge(i8* %call10, i8* %call13, i32 1, i32 0, i32 0, i32 0) #2 + %call15 = tail call i8* @__visc__edge(i8* %call10, i8* %call13, i32 1, i32 1, i32 1, i32 0) #2 + %call16 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64)* nonnull @_Z10var_6_nodePvm) #2 + %call17 = tail call i8* @__visc__edge(i8* %call13, i8* %call16, i32 1, i32 0, i32 0, i32 0) #2 + %call18 = tail call i8* @__visc__edge(i8* %call13, i8* %call16, i32 1, i32 1, i32 1, i32 0) #2 + %call19 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z10var_7_nodePvmS_m) #2 + %call20 = tail call i8* @__visc__edge(i8* %call16, i8* %call19, i32 1, i32 0, i32 0, i32 0) #2 + %call21 = tail call i8* @__visc__edge(i8* %call16, i8* %call19, i32 1, i32 1, i32 1, i32 0) #2 + tail call void @__visc__bindIn(i8* %call19, i32 10, i32 2, i32 0) #2 + tail call void @__visc__bindIn(i8* %call19, i32 11, i32 3, i32 0) #2 + %call22 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z10var_8_nodePvmS_m) #2 + %call23 = tail call i8* @__visc__edge(i8* %call19, i8* %call22, i32 1, i32 0, i32 0, i32 0) #2 + %call24 = tail call i8* @__visc__edge(i8* %call19, i8* %call22, i32 1, i32 1, i32 1, i32 0) #2 + tail call void @__visc__bindIn(i8* %call22, i32 12, i32 2, i32 0) #2 + tail call void @__visc__bindIn(i8* %call22, i32 13, i32 3, i32 0) #2 + %call25 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64)* nonnull @_Z10var_9_nodePvm) #2 + %call26 = tail call i8* @__visc__edge(i8* %call22, i8* %call25, i32 1, i32 0, i32 0, i32 0) #2 + %call27 = tail call i8* @__visc__edge(i8* %call22, i8* %call25, i32 1, i32 1, i32 1, i32 0) #2 + %call28 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_10_nodePvmS_m) #2 + %call29 = tail call i8* @__visc__edge(i8* %call25, i8* %call28, i32 1, i32 0, i32 0, i32 0) #2 + %call30 = tail call i8* @__visc__edge(i8* %call25, i8* %call28, i32 1, i32 1, i32 1, i32 0) #2 + tail call void @__visc__bindIn(i8* %call28, i32 14, i32 2, i32 0) #2 + tail call void @__visc__bindIn(i8* %call28, i32 15, i32 3, i32 0) #2 + %call31 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_11_nodePvmS_m) #2 + %call32 = tail call i8* @__visc__edge(i8* %call28, i8* %call31, i32 1, i32 0, i32 0, i32 0) #2 + %call33 = tail call i8* @__visc__edge(i8* %call28, i8* %call31, i32 1, i32 1, i32 1, i32 0) #2 + tail call void @__visc__bindIn(i8* %call31, i32 16, i32 2, i32 0) #2 + tail call void @__visc__bindIn(i8* %call31, i32 17, i32 3, i32 0) #2 + %call34 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64)* nonnull @_Z11var_12_nodePvm) #2 + %call35 = tail call i8* @__visc__edge(i8* %call31, i8* %call34, i32 1, i32 0, i32 0, i32 0) #2 + %call36 = tail call i8* @__visc__edge(i8* %call31, i8* %call34, i32 1, i32 1, i32 1, i32 0) #2 + %call37 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64)* nonnull @_Z11var_13_nodePvm) #2 + %call38 = tail call i8* @__visc__edge(i8* %call34, i8* %call37, i32 1, i32 0, i32 0, i32 0) #2 + %call39 = tail call i8* @__visc__edge(i8* %call34, i8* %call37, i32 1, i32 1, i32 1, i32 0) #2 + %call40 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_14_nodePvmS_m) #2 + %call41 = tail call i8* @__visc__edge(i8* %call37, i8* %call40, i32 1, i32 0, i32 0, i32 0) #2 + %call42 = tail call i8* @__visc__edge(i8* %call37, i8* %call40, i32 1, i32 1, i32 1, i32 0) #2 + tail call void @__visc__bindIn(i8* %call40, i32 18, i32 2, i32 0) #2 + tail call void @__visc__bindIn(i8* %call40, i32 19, i32 3, i32 0) #2 + %call43 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_15_nodePvmS_m) #2 + %call44 = tail call i8* @__visc__edge(i8* %call40, i8* %call43, i32 1, i32 0, i32 0, i32 0) #2 + %call45 = tail call i8* @__visc__edge(i8* %call40, i8* %call43, i32 1, i32 1, i32 1, i32 0) #2 + tail call void @__visc__bindIn(i8* %call43, i32 20, i32 2, i32 0) #2 + tail call void @__visc__bindIn(i8* %call43, i32 21, i32 3, i32 0) #2 + %call46 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64)* nonnull @_Z11var_16_nodePvm) #2 + %call47 = tail call i8* @__visc__edge(i8* %call43, i8* %call46, i32 1, i32 0, i32 0, i32 0) #2 + %call48 = tail call i8* @__visc__edge(i8* %call43, i8* %call46, i32 1, i32 1, i32 1, i32 0) #2 + %call49 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_17_nodePvmS_m) #2 + %call50 = tail call i8* @__visc__edge(i8* %call46, i8* %call49, i32 1, i32 0, i32 0, i32 0) #2 + %call51 = tail call i8* @__visc__edge(i8* %call46, i8* %call49, i32 1, i32 1, i32 1, i32 0) #2 + tail call void @__visc__bindIn(i8* %call49, i32 22, i32 2, i32 0) #2 + tail call void @__visc__bindIn(i8* %call49, i32 23, i32 3, i32 0) #2 + %call52 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_18_nodePvmS_m) #2 + %call53 = tail call i8* @__visc__edge(i8* %call49, i8* %call52, i32 1, i32 0, i32 0, i32 0) #2 + %call54 = tail call i8* @__visc__edge(i8* %call49, i8* %call52, i32 1, i32 1, i32 1, i32 0) #2 + tail call void @__visc__bindIn(i8* %call52, i32 24, i32 2, i32 0) #2 + tail call void @__visc__bindIn(i8* %call52, i32 25, i32 3, i32 0) #2 + %call55 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64)* nonnull @_Z11var_19_nodePvm) #2 + %call56 = tail call i8* @__visc__edge(i8* %call52, i8* %call55, i32 1, i32 0, i32 0, i32 0) #2 + %call57 = tail call i8* @__visc__edge(i8* %call52, i8* %call55, i32 1, i32 1, i32 1, i32 0) #2 + %call58 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_20_nodePvmS_m) #2 + %call59 = tail call i8* @__visc__edge(i8* %call55, i8* %call58, i32 1, i32 0, i32 0, i32 0) #2 + %call60 = tail call i8* @__visc__edge(i8* %call55, i8* %call58, i32 1, i32 1, i32 1, i32 0) #2 + tail call void @__visc__bindIn(i8* %call58, i32 26, i32 2, i32 0) #2 + tail call void @__visc__bindIn(i8* %call58, i32 27, i32 3, i32 0) #2 + %call61 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_21_nodePvmS_m) #2 + %call62 = tail call i8* @__visc__edge(i8* %call58, i8* %call61, i32 1, i32 0, i32 0, i32 0) #2 + %call63 = tail call i8* @__visc__edge(i8* %call58, i8* %call61, i32 1, i32 1, i32 1, i32 0) #2 + tail call void @__visc__bindIn(i8* %call61, i32 28, i32 2, i32 0) #2 + tail call void @__visc__bindIn(i8* %call61, i32 29, i32 3, i32 0) #2 + %call64 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64)* nonnull @_Z11var_22_nodePvm) #2 + %call65 = tail call i8* @__visc__edge(i8* %call61, i8* %call64, i32 1, i32 0, i32 0, i32 0) #2 + %call66 = tail call i8* @__visc__edge(i8* %call61, i8* %call64, i32 1, i32 1, i32 1, i32 0) #2 + %call67 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64)* nonnull @_Z11var_23_nodePvm) #2 + %call68 = tail call i8* @__visc__edge(i8* %call64, i8* %call67, i32 1, i32 0, i32 0, i32 0) #2 + %call69 = tail call i8* @__visc__edge(i8* %call64, i8* %call67, i32 1, i32 1, i32 1, i32 0) #2 + %call70 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_24_nodePvmS_m) #2 + %call71 = tail call i8* @__visc__edge(i8* %call67, i8* %call70, i32 1, i32 0, i32 0, i32 0) #2 + %call72 = tail call i8* @__visc__edge(i8* %call67, i8* %call70, i32 1, i32 1, i32 1, i32 0) #2 + tail call void @__visc__bindIn(i8* %call70, i32 30, i32 2, i32 0) #2 + tail call void @__visc__bindIn(i8* %call70, i32 31, i32 3, i32 0) #2 + %call73 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_25_nodePvmS_m) #2 + %call74 = tail call i8* @__visc__edge(i8* %call70, i8* %call73, i32 1, i32 0, i32 0, i32 0) #2 + %call75 = tail call i8* @__visc__edge(i8* %call70, i8* %call73, i32 1, i32 1, i32 1, i32 0) #2 + tail call void @__visc__bindIn(i8* %call73, i32 32, i32 2, i32 0) #2 + tail call void @__visc__bindIn(i8* %call73, i32 33, i32 3, i32 0) #2 + %call76 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64)* nonnull @_Z11var_26_nodePvm) #2 + %call77 = tail call i8* @__visc__edge(i8* %call73, i8* %call76, i32 1, i32 0, i32 0, i32 0) #2 + %call78 = tail call i8* @__visc__edge(i8* %call73, i8* %call76, i32 1, i32 1, i32 1, i32 0) #2 + %call79 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_27_nodePvmS_m) #2 + %call80 = tail call i8* @__visc__edge(i8* %call76, i8* %call79, i32 1, i32 0, i32 0, i32 0) #2 + %call81 = tail call i8* @__visc__edge(i8* %call76, i8* %call79, i32 1, i32 1, i32 1, i32 0) #2 + tail call void @__visc__bindIn(i8* %call79, i32 34, i32 2, i32 0) #2 + tail call void @__visc__bindIn(i8* %call79, i32 35, i32 3, i32 0) #2 + %call82 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_28_nodePvmS_m) #2 + %call83 = tail call i8* @__visc__edge(i8* %call79, i8* %call82, i32 1, i32 0, i32 0, i32 0) #2 + %call84 = tail call i8* @__visc__edge(i8* %call79, i8* %call82, i32 1, i32 1, i32 1, i32 0) #2 + tail call void @__visc__bindIn(i8* %call82, i32 36, i32 2, i32 0) #2 + tail call void @__visc__bindIn(i8* %call82, i32 37, i32 3, i32 0) #2 + %call85 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64)* nonnull @_Z11var_29_nodePvm) #2 + %call86 = tail call i8* @__visc__edge(i8* %call82, i8* %call85, i32 1, i32 0, i32 0, i32 0) #2 + %call87 = tail call i8* @__visc__edge(i8* %call82, i8* %call85, i32 1, i32 1, i32 1, i32 0) #2 + %call88 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_30_nodePvmS_m) #2 + %call89 = tail call i8* @__visc__edge(i8* %call85, i8* %call88, i32 1, i32 0, i32 0, i32 0) #2 + %call90 = tail call i8* @__visc__edge(i8* %call85, i8* %call88, i32 1, i32 1, i32 1, i32 0) #2 + tail call void @__visc__bindIn(i8* %call88, i32 38, i32 2, i32 0) #2 + tail call void @__visc__bindIn(i8* %call88, i32 39, i32 3, i32 0) #2 + %call91 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_31_nodePvmS_m) #2 + %call92 = tail call i8* @__visc__edge(i8* %call88, i8* %call91, i32 1, i32 0, i32 0, i32 0) #2 + %call93 = tail call i8* @__visc__edge(i8* %call88, i8* %call91, i32 1, i32 1, i32 1, i32 0) #2 + tail call void @__visc__bindIn(i8* %call91, i32 40, i32 2, i32 0) #2 + tail call void @__visc__bindIn(i8* %call91, i32 41, i32 3, i32 0) #2 + %call94 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64)* nonnull @_Z11var_32_nodePvm) #2 + %call95 = tail call i8* @__visc__edge(i8* %call91, i8* %call94, i32 1, i32 0, i32 0, i32 0) #2 + %call96 = tail call i8* @__visc__edge(i8* %call91, i8* %call94, i32 1, i32 1, i32 1, i32 0) #2 + %call97 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64)* nonnull @_Z11var_33_nodePvm) #2 + %call98 = tail call i8* @__visc__edge(i8* %call94, i8* %call97, i32 1, i32 0, i32 0, i32 0) #2 + %call99 = tail call i8* @__visc__edge(i8* %call94, i8* %call97, i32 1, i32 1, i32 1, i32 0) #2 + %call100 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_34_nodePvmS_m) #2 + %call101 = tail call i8* @__visc__edge(i8* %call97, i8* %call100, i32 1, i32 0, i32 0, i32 0) #2 + %call102 = tail call i8* @__visc__edge(i8* %call97, i8* %call100, i32 1, i32 1, i32 1, i32 0) #2 + tail call void @__visc__bindIn(i8* %call100, i32 42, i32 2, i32 0) #2 + tail call void @__visc__bindIn(i8* %call100, i32 43, i32 3, i32 0) #2 + %call103 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_35_nodePvmS_m) #2 + %call104 = tail call i8* @__visc__edge(i8* %call100, i8* %call103, i32 1, i32 0, i32 0, i32 0) #2 + %call105 = tail call i8* @__visc__edge(i8* %call100, i8* %call103, i32 1, i32 1, i32 1, i32 0) #2 + tail call void @__visc__bindIn(i8* %call103, i32 44, i32 2, i32 0) #2 + tail call void @__visc__bindIn(i8* %call103, i32 45, i32 3, i32 0) #2 + %call106 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64)* nonnull @_Z11var_36_nodePvm) #2 + %call107 = tail call i8* @__visc__edge(i8* %call103, i8* %call106, i32 1, i32 0, i32 0, i32 0) #2 + %call108 = tail call i8* @__visc__edge(i8* %call103, i8* %call106, i32 1, i32 1, i32 1, i32 0) #2 + %call109 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_37_nodePvmS_m) #2 + %call110 = tail call i8* @__visc__edge(i8* %call106, i8* %call109, i32 1, i32 0, i32 0, i32 0) #2 + %call111 = tail call i8* @__visc__edge(i8* %call106, i8* %call109, i32 1, i32 1, i32 1, i32 0) #2 + tail call void @__visc__bindIn(i8* %call109, i32 46, i32 2, i32 0) #2 + tail call void @__visc__bindIn(i8* %call109, i32 47, i32 3, i32 0) #2 + %call112 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_38_nodePvmS_m) #2 + %call113 = tail call i8* @__visc__edge(i8* %call109, i8* %call112, i32 1, i32 0, i32 0, i32 0) #2 + %call114 = tail call i8* @__visc__edge(i8* %call109, i8* %call112, i32 1, i32 1, i32 1, i32 0) #2 + tail call void @__visc__bindIn(i8* %call112, i32 48, i32 2, i32 0) #2 + tail call void @__visc__bindIn(i8* %call112, i32 49, i32 3, i32 0) #2 + %call115 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64)* nonnull @_Z11var_39_nodePvm) #2 + %call116 = tail call i8* @__visc__edge(i8* %call112, i8* %call115, i32 1, i32 0, i32 0, i32 0) #2 + %call117 = tail call i8* @__visc__edge(i8* %call112, i8* %call115, i32 1, i32 1, i32 1, i32 0) #2 + %call118 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_40_nodePvmS_m) #2 + %call119 = tail call i8* @__visc__edge(i8* %call115, i8* %call118, i32 1, i32 0, i32 0, i32 0) #2 + %call120 = tail call i8* @__visc__edge(i8* %call115, i8* %call118, i32 1, i32 1, i32 1, i32 0) #2 + tail call void @__visc__bindIn(i8* %call118, i32 50, i32 2, i32 0) #2 + tail call void @__visc__bindIn(i8* %call118, i32 51, i32 3, i32 0) #2 + %call121 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_41_nodePvmS_m) #2 + %call122 = tail call i8* @__visc__edge(i8* %call118, i8* %call121, i32 1, i32 0, i32 0, i32 0) #2 + %call123 = tail call i8* @__visc__edge(i8* %call118, i8* %call121, i32 1, i32 1, i32 1, i32 0) #2 + tail call void @__visc__bindIn(i8* %call121, i32 52, i32 2, i32 0) #2 + tail call void @__visc__bindIn(i8* %call121, i32 53, i32 3, i32 0) #2 + %call124 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64)* nonnull @_Z11var_42_nodePvm) #2 + %call125 = tail call i8* @__visc__edge(i8* %call121, i8* %call124, i32 1, i32 0, i32 0, i32 0) #2 + %call126 = tail call i8* @__visc__edge(i8* %call121, i8* %call124, i32 1, i32 1, i32 1, i32 0) #2 + %call127 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64)* nonnull @_Z11var_43_nodePvm) #2 + %call128 = tail call i8* @__visc__edge(i8* %call124, i8* %call127, i32 1, i32 0, i32 0, i32 0) #2 + %call129 = tail call i8* @__visc__edge(i8* %call124, i8* %call127, i32 1, i32 1, i32 1, i32 0) #2 + %call130 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_44_nodePvmS_m) #2 + %call131 = tail call i8* @__visc__edge(i8* %call127, i8* %call130, i32 1, i32 0, i32 0, i32 0) #2 + %call132 = tail call i8* @__visc__edge(i8* %call127, i8* %call130, i32 1, i32 1, i32 1, i32 0) #2 + tail call void @__visc__bindIn(i8* %call130, i32 54, i32 2, i32 0) #2 + tail call void @__visc__bindIn(i8* %call130, i32 55, i32 3, i32 0) #2 + %call133 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_45_nodePvmS_m) #2 + %call134 = tail call i8* @__visc__edge(i8* %call130, i8* %call133, i32 1, i32 0, i32 0, i32 0) #2 + %call135 = tail call i8* @__visc__edge(i8* %call130, i8* %call133, i32 1, i32 1, i32 1, i32 0) #2 + tail call void @__visc__bindIn(i8* %call133, i32 56, i32 2, i32 0) #2 + tail call void @__visc__bindIn(i8* %call133, i32 57, i32 3, i32 0) #2 + %call136 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64)* nonnull @_Z11var_46_nodePvm) #2 + %call137 = tail call i8* @__visc__edge(i8* %call133, i8* %call136, i32 1, i32 0, i32 0, i32 0) #2 + %call138 = tail call i8* @__visc__edge(i8* %call133, i8* %call136, i32 1, i32 1, i32 1, i32 0) #2 + %call139 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_47_nodePvmS_m) #2 + %call140 = tail call i8* @__visc__edge(i8* %call136, i8* %call139, i32 1, i32 0, i32 0, i32 0) #2 + %call141 = tail call i8* @__visc__edge(i8* %call136, i8* %call139, i32 1, i32 1, i32 1, i32 0) #2 + tail call void @__visc__bindIn(i8* %call139, i32 58, i32 2, i32 0) #2 + tail call void @__visc__bindIn(i8* %call139, i32 59, i32 3, i32 0) #2 + %call142 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64, i8*, i64)* nonnull @_Z11var_48_nodePvmS_m) #2 + %call143 = tail call i8* @__visc__edge(i8* %call139, i8* %call142, i32 1, i32 0, i32 0, i32 0) #2 + %call144 = tail call i8* @__visc__edge(i8* %call139, i8* %call142, i32 1, i32 1, i32 1, i32 0) #2 + tail call void @__visc__bindIn(i8* %call142, i32 60, i32 2, i32 0) #2 + tail call void @__visc__bindIn(i8* %call142, i32 61, i32 3, i32 0) #2 + %call145 = tail call i8* (i32, ...) @__visc__createNodeND(i32 0, void (i8*, i64)* nonnull @_Z11var_49_nodePvm) #2 + %call146 = tail call i8* @__visc__edge(i8* %call142, i8* %call145, i32 1, i32 0, i32 0, i32 0) #2 + %call147 = tail call i8* @__visc__edge(i8* %call142, i8* %call145, i32 1, i32 1, i32 1, i32 0) #2 + tail call void @__visc__bindOut(i8* %call145, i32 0, i32 0, i32 0) #2 + tail call void @__visc__bindOut(i8* %call145, i32 1, i32 1, i32 0) #2 ret void } -declare i8* @__visc__createNodeND(i32, ...) local_unnamed_addr #3 +declare i8* @__visc__createNodeND(i32, ...) local_unnamed_addr #0 -declare void @__visc__bindIn(i8*, i32, i32, i32) local_unnamed_addr #3 +declare void @__visc__bindIn(i8*, i32, i32, i32) local_unnamed_addr #0 -declare i8* @__visc__edge(i8*, i8*, i32, i32, i32, i32) local_unnamed_addr #3 +declare i8* @__visc__edge(i8*, i8*, i32, i32, i32, i32) local_unnamed_addr #0 -declare void @__visc__bindOut(i8*, i32, i32, i32) local_unnamed_addr #3 +declare void @__visc__bindOut(i8*, i32, i32, i32) local_unnamed_addr #0 ; Function Attrs: norecurse nounwind uwtable -define i32 @main() local_unnamed_addr #5 { +define i32 @main() local_unnamed_addr #6 { entry: %__dnew.i.i.i.i = alloca i64, align 8 %dir_prefix = alloca %"class.std::__cxx11::basic_string", align 8 @@ -2727,2750 +5442,3223 @@ entry: %dense_2_b_path = alloca %"class.std::__cxx11::basic_string", align 8 %ref.tmp120 = alloca %"class.std::__cxx11::basic_string", align 8 %0 = bitcast %"class.std::__cxx11::basic_string"* %dir_prefix to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %0) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %0) #2 %1 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dir_prefix, i64 0, i32 2 %2 = bitcast %"class.std::__cxx11::basic_string"* %dir_prefix to %union.anon** - store %union.anon* %1, %union.anon** %2, align 8, !tbaa !52 + store %union.anon* %1, %union.anon** %2, align 8, !tbaa !58 %3 = bitcast %union.anon* %1 to i8* %4 = bitcast i64* %__dnew.i.i.i.i to i8* - call void @llvm.lifetime.start(i64 8, i8* nonnull %4) #7 - store i64 71, i64* %__dnew.i.i.i.i, align 8, !tbaa !12 - %call5.i.i.i.i = call i8* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_createERmm(%"class.std::__cxx11::basic_string"* nonnull %dir_prefix, i64* nonnull dereferenceable(8) %__dnew.i.i.i.i, i64 0) #7 + call void @llvm.lifetime.start(i64 8, i8* nonnull %4) #2 + store i64 69, i64* %__dnew.i.i.i.i, align 8, !tbaa !15 + %call5.i.i.i.i = call i8* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_createERmm(%"class.std::__cxx11::basic_string"* nonnull %dir_prefix, i64* nonnull dereferenceable(8) %__dnew.i.i.i.i, i64 0) #2 %_M_p.i13.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dir_prefix, i64 0, i32 0, i32 0 - store i8* %call5.i.i.i.i, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !56 - %5 = load i64, i64* %__dnew.i.i.i.i, align 8, !tbaa !12 + store i8* %call5.i.i.i.i, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !62 + %5 = load i64, i64* %__dnew.i.i.i.i, align 8, !tbaa !15 %_M_allocated_capacity.i.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dir_prefix, i64 0, i32 2, i32 0 - store i64 %5, i64* %_M_allocated_capacity.i.i.i.i.i, align 8, !tbaa !12 - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %call5.i.i.i.i, i8* nonnull getelementptr inbounds ([72 x i8], [72 x i8]* @.str.23, i64 0, i64 0), i64 71, i32 1, i1 false) #7 + store i64 %5, i64* %_M_allocated_capacity.i.i.i.i.i, align 8, !tbaa !15 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %call5.i.i.i.i, i8* nonnull getelementptr inbounds ([70 x i8], [70 x i8]* @.str.41, i64 0, i64 0), i64 69, i32 1, i1 false) #2 %_M_string_length.i.i.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dir_prefix, i64 0, i32 1 - store i64 %5, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !53 + store i64 %5, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !59 %arrayidx.i.i.i.i.i = getelementptr inbounds i8, i8* %call5.i.i.i.i, i64 %5 - store i8 0, i8* %arrayidx.i.i.i.i.i, align 1, !tbaa !36 - call void @llvm.lifetime.end(i64 8, i8* nonnull %4) #7 + store i8 0, i8* %arrayidx.i.i.i.i.i, align 1, !tbaa !42 + call void @llvm.lifetime.end(i64 8, i8* nonnull %4) #2 %6 = bitcast %"class.std::__cxx11::basic_string"* %input_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %6) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %6) #2 %7 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp1 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %7) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %7) #2 %8 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp1, i64 0, i32 2 %9 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp1 to %union.anon** - store %union.anon* %8, %union.anon** %9, align 8, !tbaa !52 + store %union.anon* %8, %union.anon** %9, align 8, !tbaa !58 %10 = bitcast %union.anon* %8 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %10, i8* nonnull getelementptr inbounds ([10 x i8], [10 x i8]* @.str.24, i64 0, i64 0), i64 9, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i279 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp1, i64 0, i32 1 - store i64 9, i64* %_M_string_length.i.i.i.i.i.i279, align 8, !tbaa !53 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %10, i8* nonnull getelementptr inbounds ([10 x i8], [10 x i8]* @.str.42, i64 0, i64 0), i64 9, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i287 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp1, i64 0, i32 1 + store i64 9, i64* %_M_string_length.i.i.i.i.i.i287, align 8, !tbaa !59 %11 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp1, i64 0, i32 2, i32 1, i64 1 - store i8 0, i8* %11, align 1, !tbaa !36 - %12 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !53, !noalias !62 - %13 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !56, !noalias !62 - %call3.i.i.i = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp1, i64 0, i64 0, i8* %13, i64 %12) #7, !noalias !62 + store i8 0, i8* %11, align 1, !tbaa !42 + %12 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !59, !noalias !80 + %13 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !62, !noalias !80 + %call3.i.i.i = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp1, i64 0, i64 0, i8* %13, i64 %12) #2, !noalias !80 %14 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %input_path, i64 0, i32 2 %15 = bitcast %"class.std::__cxx11::basic_string"* %input_path to %union.anon** - store %union.anon* %14, %union.anon** %15, align 8, !tbaa !52, !alias.scope !62 + store %union.anon* %14, %union.anon** %15, align 8, !tbaa !58, !alias.scope !80 %_M_p.i.i23.i.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i, i64 0, i32 0, i32 0 - %16 = load i8*, i8** %_M_p.i.i23.i.i, align 8, !tbaa !56 + %16 = load i8*, i8** %_M_p.i.i23.i.i, align 8, !tbaa !62 %17 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i, i64 0, i32 2 - %arraydecay.i.i.i.i282 = bitcast %union.anon* %17 to i8* - %cmp.i.i.i283 = icmp eq i8* %16, %arraydecay.i.i.i.i282 - br i1 %cmp.i.i.i283, label %if.then.i.i284, label %if.else.i.i + %arraydecay.i.i.i.i290 = bitcast %union.anon* %17 to i8* + %cmp.i.i.i291 = icmp eq i8* %16, %arraydecay.i.i.i.i290 + br i1 %cmp.i.i.i291, label %if.then.i.i292, label %if.else.i.i -if.then.i.i284: ; preds = %entry +if.then.i.i292: ; preds = %entry %arraydecay.i.i.i = bitcast %union.anon* %14 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i, i8* %16, i64 16, i32 1, i1 false) #7 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i, i8* %16, i64 16, i32 1, i1 false) #2 br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit if.else.i.i: ; preds = %entry %_M_p.i21.i.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %input_path, i64 0, i32 0, i32 0 - store i8* %16, i8** %_M_p.i21.i.i, align 8, !tbaa !56, !alias.scope !62 + store i8* %16, i8** %_M_p.i21.i.i, align 8, !tbaa !62, !alias.scope !80 %_M_allocated_capacity.i.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i, i64 0, i32 2, i32 0 - %18 = load i64, i64* %_M_allocated_capacity.i.i, align 8, !tbaa !12 + %18 = load i64, i64* %_M_allocated_capacity.i.i, align 8, !tbaa !15 %_M_allocated_capacity.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %input_path, i64 0, i32 2, i32 0 - store i64 %18, i64* %_M_allocated_capacity.i.i.i, align 8, !tbaa !12, !alias.scope !62 + store i64 %18, i64* %_M_allocated_capacity.i.i.i, align 8, !tbaa !15, !alias.scope !80 br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit: ; preds = %if.then.i.i284, %if.else.i.i +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit: ; preds = %if.then.i.i292, %if.else.i.i %_M_string_length.i20.i.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i, i64 0, i32 1 - %19 = load i64, i64* %_M_string_length.i20.i.i, align 8, !tbaa !53 + %19 = load i64, i64* %_M_string_length.i20.i.i, align 8, !tbaa !59 %_M_string_length.i.i2.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %input_path, i64 0, i32 1 - store i64 %19, i64* %_M_string_length.i.i2.i, align 8, !tbaa !53, !alias.scope !62 + store i64 %19, i64* %_M_string_length.i.i2.i, align 8, !tbaa !59, !alias.scope !80 %20 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i to %union.anon** - store %union.anon* %17, %union.anon** %20, align 8, !tbaa !56 - store i64 0, i64* %_M_string_length.i20.i.i, align 8, !tbaa !53 - store i8 0, i8* %arraydecay.i.i.i.i282, align 1, !tbaa !36 - %_M_p.i.i.i.i285 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp1, i64 0, i32 0, i32 0 - %21 = load i8*, i8** %_M_p.i.i.i.i285, align 8, !tbaa !56 - %cmp.i.i.i287 = icmp eq i8* %21, %10 - br i1 %cmp.i.i.i287, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit289, label %if.then.i.i288 - -if.then.i.i288: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit - call void @_ZdlPv(i8* %21) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit289 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit289: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit, %if.then.i.i288 - call void @llvm.lifetime.end(i64 32, i8* nonnull %7) #7 + store %union.anon* %17, %union.anon** %20, align 8, !tbaa !62 + store i64 0, i64* %_M_string_length.i20.i.i, align 8, !tbaa !59 + store i8 0, i8* %arraydecay.i.i.i.i290, align 1, !tbaa !42 + %_M_p.i.i.i.i293 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp1, i64 0, i32 0, i32 0 + %21 = load i8*, i8** %_M_p.i.i.i.i293, align 8, !tbaa !62 + %cmp.i.i.i295 = icmp eq i8* %21, %10 + br i1 %cmp.i.i.i295, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit297, label %if.then.i.i296 + +if.then.i.i296: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit + call void @_ZdlPv(i8* %21) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit297 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit297: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit, %if.then.i.i296 + call void @llvm.lifetime.end(i64 32, i8* nonnull %7) #2 %22 = bitcast %"class.std::__cxx11::basic_string"* %labels_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %22) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %22) #2 %23 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp3 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %23) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %23) #2 %24 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp3, i64 0, i32 2 %25 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp3 to %union.anon** - store %union.anon* %24, %union.anon** %25, align 8, !tbaa !52 + store %union.anon* %24, %union.anon** %25, align 8, !tbaa !58 %26 = bitcast %union.anon* %24 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %26, i8* nonnull getelementptr inbounds ([11 x i8], [11 x i8]* @.str.25, i64 0, i64 0), i64 10, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i308 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp3, i64 0, i32 1 - store i64 10, i64* %_M_string_length.i.i.i.i.i.i308, align 8, !tbaa !53 - %27 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp3, i64 0, i32 2, i32 1, i64 2 - store i8 0, i8* %27, align 2, !tbaa !36 - %28 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !53, !noalias !65 - %29 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !56, !noalias !65 - %call3.i.i.i313 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp3, i64 0, i64 0, i8* %29, i64 %28) #7, !noalias !65 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %26, i8* nonnull getelementptr inbounds ([13 x i8], [13 x i8]* @.str.43, i64 0, i64 0), i64 12, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i316 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp3, i64 0, i32 1 + store i64 12, i64* %_M_string_length.i.i.i.i.i.i316, align 8, !tbaa !59 + %27 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp3, i64 0, i32 2, i32 1, i64 4 + store i8 0, i8* %27, align 4, !tbaa !42 + %28 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !59, !noalias !83 + %29 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !62, !noalias !83 + %call3.i.i.i321 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp3, i64 0, i64 0, i8* %29, i64 %28) #2, !noalias !83 %30 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %labels_path, i64 0, i32 2 %31 = bitcast %"class.std::__cxx11::basic_string"* %labels_path to %union.anon** - store %union.anon* %30, %union.anon** %31, align 8, !tbaa !52, !alias.scope !65 - %_M_p.i.i23.i.i314 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i313, i64 0, i32 0, i32 0 - %32 = load i8*, i8** %_M_p.i.i23.i.i314, align 8, !tbaa !56 - %33 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i313, i64 0, i32 2 - %arraydecay.i.i.i.i315 = bitcast %union.anon* %33 to i8* - %cmp.i.i.i316 = icmp eq i8* %32, %arraydecay.i.i.i.i315 - br i1 %cmp.i.i.i316, label %if.then.i.i318, label %if.else.i.i322 - -if.then.i.i318: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit289 - %arraydecay.i.i.i317 = bitcast %union.anon* %30 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i317, i8* %32, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit325 - -if.else.i.i322: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit289 - %_M_p.i21.i.i319 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %labels_path, i64 0, i32 0, i32 0 - store i8* %32, i8** %_M_p.i21.i.i319, align 8, !tbaa !56, !alias.scope !65 - %_M_allocated_capacity.i.i320 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i313, i64 0, i32 2, i32 0 - %34 = load i64, i64* %_M_allocated_capacity.i.i320, align 8, !tbaa !12 - %_M_allocated_capacity.i.i.i321 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %labels_path, i64 0, i32 2, i32 0 - store i64 %34, i64* %_M_allocated_capacity.i.i.i321, align 8, !tbaa !12, !alias.scope !65 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit325 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit325: ; preds = %if.then.i.i318, %if.else.i.i322 - %_M_string_length.i20.i.i323 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i313, i64 0, i32 1 - %35 = load i64, i64* %_M_string_length.i20.i.i323, align 8, !tbaa !53 - %_M_string_length.i.i2.i324 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %labels_path, i64 0, i32 1 - store i64 %35, i64* %_M_string_length.i.i2.i324, align 8, !tbaa !53, !alias.scope !65 - %36 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i313 to %union.anon** - store %union.anon* %33, %union.anon** %36, align 8, !tbaa !56 - store i64 0, i64* %_M_string_length.i20.i.i323, align 8, !tbaa !53 - store i8 0, i8* %arraydecay.i.i.i.i315, align 1, !tbaa !36 - %_M_p.i.i.i.i326 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp3, i64 0, i32 0, i32 0 - %37 = load i8*, i8** %_M_p.i.i.i.i326, align 8, !tbaa !56 - %cmp.i.i.i328 = icmp eq i8* %37, %26 - br i1 %cmp.i.i.i328, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit330, label %if.then.i.i329 - -if.then.i.i329: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit325 - call void @_ZdlPv(i8* %37) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit330 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit330: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit325, %if.then.i.i329 - call void @llvm.lifetime.end(i64 32, i8* nonnull %23) #7 + store %union.anon* %30, %union.anon** %31, align 8, !tbaa !58, !alias.scope !83 + %_M_p.i.i23.i.i322 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i321, i64 0, i32 0, i32 0 + %32 = load i8*, i8** %_M_p.i.i23.i.i322, align 8, !tbaa !62 + %33 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i321, i64 0, i32 2 + %arraydecay.i.i.i.i323 = bitcast %union.anon* %33 to i8* + %cmp.i.i.i324 = icmp eq i8* %32, %arraydecay.i.i.i.i323 + br i1 %cmp.i.i.i324, label %if.then.i.i326, label %if.else.i.i330 + +if.then.i.i326: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit297 + %arraydecay.i.i.i325 = bitcast %union.anon* %30 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i325, i8* %32, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit333 + +if.else.i.i330: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit297 + %_M_p.i21.i.i327 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %labels_path, i64 0, i32 0, i32 0 + store i8* %32, i8** %_M_p.i21.i.i327, align 8, !tbaa !62, !alias.scope !83 + %_M_allocated_capacity.i.i328 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i321, i64 0, i32 2, i32 0 + %34 = load i64, i64* %_M_allocated_capacity.i.i328, align 8, !tbaa !15 + %_M_allocated_capacity.i.i.i329 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %labels_path, i64 0, i32 2, i32 0 + store i64 %34, i64* %_M_allocated_capacity.i.i.i329, align 8, !tbaa !15, !alias.scope !83 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit333 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit333: ; preds = %if.then.i.i326, %if.else.i.i330 + %_M_string_length.i20.i.i331 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i321, i64 0, i32 1 + %35 = load i64, i64* %_M_string_length.i20.i.i331, align 8, !tbaa !59 + %_M_string_length.i.i2.i332 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %labels_path, i64 0, i32 1 + store i64 %35, i64* %_M_string_length.i.i2.i332, align 8, !tbaa !59, !alias.scope !83 + %36 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i321 to %union.anon** + store %union.anon* %33, %union.anon** %36, align 8, !tbaa !62 + store i64 0, i64* %_M_string_length.i20.i.i331, align 8, !tbaa !59 + store i8 0, i8* %arraydecay.i.i.i.i323, align 1, !tbaa !42 + %_M_p.i.i.i.i334 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp3, i64 0, i32 0, i32 0 + %37 = load i8*, i8** %_M_p.i.i.i.i334, align 8, !tbaa !62 + %cmp.i.i.i336 = icmp eq i8* %37, %26 + br i1 %cmp.i.i.i336, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit338, label %if.then.i.i337 + +if.then.i.i337: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit333 + call void @_ZdlPv(i8* %37) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit338 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit338: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit333, %if.then.i.i337 + call void @llvm.lifetime.end(i64 32, i8* nonnull %23) #2 %38 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_1_w_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %38) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %38) #2 %39 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp5 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %39) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %39) #2 %40 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp5, i64 0, i32 2 %41 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp5 to %union.anon** - store %union.anon* %40, %union.anon** %41, align 8, !tbaa !52 + store %union.anon* %40, %union.anon** %41, align 8, !tbaa !58 %42 = bitcast %union.anon* %40 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %42, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.26, i64 0, i64 0), i64 14, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i349 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp5, i64 0, i32 1 - store i64 14, i64* %_M_string_length.i.i.i.i.i.i349, align 8, !tbaa !53 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %42, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.44, i64 0, i64 0), i64 14, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i357 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp5, i64 0, i32 1 + store i64 14, i64* %_M_string_length.i.i.i.i.i.i357, align 8, !tbaa !59 %43 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp5, i64 0, i32 2, i32 1, i64 6 - store i8 0, i8* %43, align 2, !tbaa !36 - %44 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !53, !noalias !68 - %45 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !56, !noalias !68 - %call3.i.i.i354 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp5, i64 0, i64 0, i8* %45, i64 %44) #7, !noalias !68 + store i8 0, i8* %43, align 2, !tbaa !42 + %44 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !59, !noalias !86 + %45 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !62, !noalias !86 + %call3.i.i.i362 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp5, i64 0, i64 0, i8* %45, i64 %44) #2, !noalias !86 %46 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_1_w_path, i64 0, i32 2 %47 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_1_w_path to %union.anon** - store %union.anon* %46, %union.anon** %47, align 8, !tbaa !52, !alias.scope !68 - %_M_p.i.i23.i.i355 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i354, i64 0, i32 0, i32 0 - %48 = load i8*, i8** %_M_p.i.i23.i.i355, align 8, !tbaa !56 - %49 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i354, i64 0, i32 2 - %arraydecay.i.i.i.i356 = bitcast %union.anon* %49 to i8* - %cmp.i.i.i357 = icmp eq i8* %48, %arraydecay.i.i.i.i356 - br i1 %cmp.i.i.i357, label %if.then.i.i359, label %if.else.i.i363 - -if.then.i.i359: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit330 - %arraydecay.i.i.i358 = bitcast %union.anon* %46 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i358, i8* %48, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit366 - -if.else.i.i363: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit330 - %_M_p.i21.i.i360 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_1_w_path, i64 0, i32 0, i32 0 - store i8* %48, i8** %_M_p.i21.i.i360, align 8, !tbaa !56, !alias.scope !68 - %_M_allocated_capacity.i.i361 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i354, i64 0, i32 2, i32 0 - %50 = load i64, i64* %_M_allocated_capacity.i.i361, align 8, !tbaa !12 - %_M_allocated_capacity.i.i.i362 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_1_w_path, i64 0, i32 2, i32 0 - store i64 %50, i64* %_M_allocated_capacity.i.i.i362, align 8, !tbaa !12, !alias.scope !68 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit366 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit366: ; preds = %if.then.i.i359, %if.else.i.i363 - %_M_string_length.i20.i.i364 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i354, i64 0, i32 1 - %51 = load i64, i64* %_M_string_length.i20.i.i364, align 8, !tbaa !53 - %_M_string_length.i.i2.i365 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_1_w_path, i64 0, i32 1 - store i64 %51, i64* %_M_string_length.i.i2.i365, align 8, !tbaa !53, !alias.scope !68 - %52 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i354 to %union.anon** - store %union.anon* %49, %union.anon** %52, align 8, !tbaa !56 - store i64 0, i64* %_M_string_length.i20.i.i364, align 8, !tbaa !53 - store i8 0, i8* %arraydecay.i.i.i.i356, align 1, !tbaa !36 - %_M_p.i.i.i.i367 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp5, i64 0, i32 0, i32 0 - %53 = load i8*, i8** %_M_p.i.i.i.i367, align 8, !tbaa !56 - %cmp.i.i.i369 = icmp eq i8* %53, %42 - br i1 %cmp.i.i.i369, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit371, label %if.then.i.i370 - -if.then.i.i370: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit366 - call void @_ZdlPv(i8* %53) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit371 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit371: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit366, %if.then.i.i370 - call void @llvm.lifetime.end(i64 32, i8* nonnull %39) #7 + store %union.anon* %46, %union.anon** %47, align 8, !tbaa !58, !alias.scope !86 + %_M_p.i.i23.i.i363 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i362, i64 0, i32 0, i32 0 + %48 = load i8*, i8** %_M_p.i.i23.i.i363, align 8, !tbaa !62 + %49 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i362, i64 0, i32 2 + %arraydecay.i.i.i.i364 = bitcast %union.anon* %49 to i8* + %cmp.i.i.i365 = icmp eq i8* %48, %arraydecay.i.i.i.i364 + br i1 %cmp.i.i.i365, label %if.then.i.i367, label %if.else.i.i371 + +if.then.i.i367: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit338 + %arraydecay.i.i.i366 = bitcast %union.anon* %46 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i366, i8* %48, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit374 + +if.else.i.i371: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit338 + %_M_p.i21.i.i368 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_1_w_path, i64 0, i32 0, i32 0 + store i8* %48, i8** %_M_p.i21.i.i368, align 8, !tbaa !62, !alias.scope !86 + %_M_allocated_capacity.i.i369 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i362, i64 0, i32 2, i32 0 + %50 = load i64, i64* %_M_allocated_capacity.i.i369, align 8, !tbaa !15 + %_M_allocated_capacity.i.i.i370 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_1_w_path, i64 0, i32 2, i32 0 + store i64 %50, i64* %_M_allocated_capacity.i.i.i370, align 8, !tbaa !15, !alias.scope !86 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit374 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit374: ; preds = %if.then.i.i367, %if.else.i.i371 + %_M_string_length.i20.i.i372 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i362, i64 0, i32 1 + %51 = load i64, i64* %_M_string_length.i20.i.i372, align 8, !tbaa !59 + %_M_string_length.i.i2.i373 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_1_w_path, i64 0, i32 1 + store i64 %51, i64* %_M_string_length.i.i2.i373, align 8, !tbaa !59, !alias.scope !86 + %52 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i362 to %union.anon** + store %union.anon* %49, %union.anon** %52, align 8, !tbaa !62 + store i64 0, i64* %_M_string_length.i20.i.i372, align 8, !tbaa !59 + store i8 0, i8* %arraydecay.i.i.i.i364, align 1, !tbaa !42 + %_M_p.i.i.i.i375 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp5, i64 0, i32 0, i32 0 + %53 = load i8*, i8** %_M_p.i.i.i.i375, align 8, !tbaa !62 + %cmp.i.i.i377 = icmp eq i8* %53, %42 + br i1 %cmp.i.i.i377, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit379, label %if.then.i.i378 + +if.then.i.i378: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit374 + call void @_ZdlPv(i8* %53) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit379 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit379: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit374, %if.then.i.i378 + call void @llvm.lifetime.end(i64 32, i8* nonnull %39) #2 %_M_p.i.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_1_w_path, i64 0, i32 0, i32 0 - %54 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !56 - %call7 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %54, i32 0, i32 64, i32 3, i32 3, i32 3) + %54 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !62 + %call7 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %54, i32 0, i64 64, i64 3, i64 3, i64 3) %55 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_1_b_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %55) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %55) #2 %56 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp8 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %56) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %56) #2 %57 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp8, i64 0, i32 2 %58 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp8 to %union.anon** - store %union.anon* %57, %union.anon** %58, align 8, !tbaa !52 + store %union.anon* %57, %union.anon** %58, align 8, !tbaa !58 %59 = bitcast %union.anon* %57 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %59, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.27, i64 0, i64 0), i64 14, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i395 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp8, i64 0, i32 1 - store i64 14, i64* %_M_string_length.i.i.i.i.i.i395, align 8, !tbaa !53 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %59, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.45, i64 0, i64 0), i64 14, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i403 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp8, i64 0, i32 1 + store i64 14, i64* %_M_string_length.i.i.i.i.i.i403, align 8, !tbaa !59 %60 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp8, i64 0, i32 2, i32 1, i64 6 - store i8 0, i8* %60, align 2, !tbaa !36 - %61 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !53, !noalias !71 - %62 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !56, !noalias !71 - %call3.i.i.i400 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp8, i64 0, i64 0, i8* %62, i64 %61) #7, !noalias !71 + store i8 0, i8* %60, align 2, !tbaa !42 + %61 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !59, !noalias !89 + %62 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !62, !noalias !89 + %call3.i.i.i408 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp8, i64 0, i64 0, i8* %62, i64 %61) #2, !noalias !89 %63 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_1_b_path, i64 0, i32 2 %64 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_1_b_path to %union.anon** - store %union.anon* %63, %union.anon** %64, align 8, !tbaa !52, !alias.scope !71 - %_M_p.i.i23.i.i401 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i400, i64 0, i32 0, i32 0 - %65 = load i8*, i8** %_M_p.i.i23.i.i401, align 8, !tbaa !56 - %66 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i400, i64 0, i32 2 - %arraydecay.i.i.i.i402 = bitcast %union.anon* %66 to i8* - %cmp.i.i.i403 = icmp eq i8* %65, %arraydecay.i.i.i.i402 - br i1 %cmp.i.i.i403, label %if.then.i.i405, label %if.else.i.i409 - -if.then.i.i405: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit371 - %arraydecay.i.i.i404 = bitcast %union.anon* %63 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i404, i8* %65, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit412 - -if.else.i.i409: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit371 - %_M_p.i21.i.i406 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_1_b_path, i64 0, i32 0, i32 0 - store i8* %65, i8** %_M_p.i21.i.i406, align 8, !tbaa !56, !alias.scope !71 - %_M_allocated_capacity.i.i407 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i400, i64 0, i32 2, i32 0 - %67 = load i64, i64* %_M_allocated_capacity.i.i407, align 8, !tbaa !12 - %_M_allocated_capacity.i.i.i408 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_1_b_path, i64 0, i32 2, i32 0 - store i64 %67, i64* %_M_allocated_capacity.i.i.i408, align 8, !tbaa !12, !alias.scope !71 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit412 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit412: ; preds = %if.then.i.i405, %if.else.i.i409 - %_M_string_length.i20.i.i410 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i400, i64 0, i32 1 - %68 = load i64, i64* %_M_string_length.i20.i.i410, align 8, !tbaa !53 - %_M_string_length.i.i2.i411 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_1_b_path, i64 0, i32 1 - store i64 %68, i64* %_M_string_length.i.i2.i411, align 8, !tbaa !53, !alias.scope !71 - %69 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i400 to %union.anon** - store %union.anon* %66, %union.anon** %69, align 8, !tbaa !56 - store i64 0, i64* %_M_string_length.i20.i.i410, align 8, !tbaa !53 - store i8 0, i8* %arraydecay.i.i.i.i402, align 1, !tbaa !36 - %_M_p.i.i.i.i413 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp8, i64 0, i32 0, i32 0 - %70 = load i8*, i8** %_M_p.i.i.i.i413, align 8, !tbaa !56 - %cmp.i.i.i415 = icmp eq i8* %70, %59 - br i1 %cmp.i.i.i415, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit417, label %if.then.i.i416 - -if.then.i.i416: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit412 - call void @_ZdlPv(i8* %70) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit417 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit417: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit412, %if.then.i.i416 - call void @llvm.lifetime.end(i64 32, i8* nonnull %56) #7 - %_M_p.i.i418 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_1_b_path, i64 0, i32 0, i32 0 - %71 = load i8*, i8** %_M_p.i.i418, align 8, !tbaa !56 - %call11 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %71, i32 0, i32 1, i32 64, i32 1, i32 1) + store %union.anon* %63, %union.anon** %64, align 8, !tbaa !58, !alias.scope !89 + %_M_p.i.i23.i.i409 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i408, i64 0, i32 0, i32 0 + %65 = load i8*, i8** %_M_p.i.i23.i.i409, align 8, !tbaa !62 + %66 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i408, i64 0, i32 2 + %arraydecay.i.i.i.i410 = bitcast %union.anon* %66 to i8* + %cmp.i.i.i411 = icmp eq i8* %65, %arraydecay.i.i.i.i410 + br i1 %cmp.i.i.i411, label %if.then.i.i413, label %if.else.i.i417 + +if.then.i.i413: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit379 + %arraydecay.i.i.i412 = bitcast %union.anon* %63 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i412, i8* %65, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit420 + +if.else.i.i417: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit379 + %_M_p.i21.i.i414 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_1_b_path, i64 0, i32 0, i32 0 + store i8* %65, i8** %_M_p.i21.i.i414, align 8, !tbaa !62, !alias.scope !89 + %_M_allocated_capacity.i.i415 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i408, i64 0, i32 2, i32 0 + %67 = load i64, i64* %_M_allocated_capacity.i.i415, align 8, !tbaa !15 + %_M_allocated_capacity.i.i.i416 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_1_b_path, i64 0, i32 2, i32 0 + store i64 %67, i64* %_M_allocated_capacity.i.i.i416, align 8, !tbaa !15, !alias.scope !89 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit420 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit420: ; preds = %if.then.i.i413, %if.else.i.i417 + %_M_string_length.i20.i.i418 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i408, i64 0, i32 1 + %68 = load i64, i64* %_M_string_length.i20.i.i418, align 8, !tbaa !59 + %_M_string_length.i.i2.i419 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_1_b_path, i64 0, i32 1 + store i64 %68, i64* %_M_string_length.i.i2.i419, align 8, !tbaa !59, !alias.scope !89 + %69 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i408 to %union.anon** + store %union.anon* %66, %union.anon** %69, align 8, !tbaa !62 + store i64 0, i64* %_M_string_length.i20.i.i418, align 8, !tbaa !59 + store i8 0, i8* %arraydecay.i.i.i.i410, align 1, !tbaa !42 + %_M_p.i.i.i.i421 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp8, i64 0, i32 0, i32 0 + %70 = load i8*, i8** %_M_p.i.i.i.i421, align 8, !tbaa !62 + %cmp.i.i.i423 = icmp eq i8* %70, %59 + br i1 %cmp.i.i.i423, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit425, label %if.then.i.i424 + +if.then.i.i424: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit420 + call void @_ZdlPv(i8* %70) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit425 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit425: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit420, %if.then.i.i424 + call void @llvm.lifetime.end(i64 32, i8* nonnull %56) #2 + %_M_p.i.i426 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_1_b_path, i64 0, i32 0, i32 0 + %71 = load i8*, i8** %_M_p.i.i426, align 8, !tbaa !62 + %call11 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %71, i32 0, i64 1, i64 64, i64 1, i64 1) %72 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_2_w_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %72) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %72) #2 %73 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp12 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %73) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %73) #2 %74 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp12, i64 0, i32 2 %75 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp12 to %union.anon** - store %union.anon* %74, %union.anon** %75, align 8, !tbaa !52 + store %union.anon* %74, %union.anon** %75, align 8, !tbaa !58 %76 = bitcast %union.anon* %74 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %76, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.28, i64 0, i64 0), i64 14, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i442 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp12, i64 0, i32 1 - store i64 14, i64* %_M_string_length.i.i.i.i.i.i442, align 8, !tbaa !53 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %76, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.46, i64 0, i64 0), i64 14, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i450 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp12, i64 0, i32 1 + store i64 14, i64* %_M_string_length.i.i.i.i.i.i450, align 8, !tbaa !59 %77 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp12, i64 0, i32 2, i32 1, i64 6 - store i8 0, i8* %77, align 2, !tbaa !36 - %78 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !53, !noalias !74 - %79 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !56, !noalias !74 - %call3.i.i.i447 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp12, i64 0, i64 0, i8* %79, i64 %78) #7, !noalias !74 + store i8 0, i8* %77, align 2, !tbaa !42 + %78 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !59, !noalias !92 + %79 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !62, !noalias !92 + %call3.i.i.i455 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp12, i64 0, i64 0, i8* %79, i64 %78) #2, !noalias !92 %80 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_2_w_path, i64 0, i32 2 %81 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_2_w_path to %union.anon** - store %union.anon* %80, %union.anon** %81, align 8, !tbaa !52, !alias.scope !74 - %_M_p.i.i23.i.i448 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i447, i64 0, i32 0, i32 0 - %82 = load i8*, i8** %_M_p.i.i23.i.i448, align 8, !tbaa !56 - %83 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i447, i64 0, i32 2 - %arraydecay.i.i.i.i449 = bitcast %union.anon* %83 to i8* - %cmp.i.i.i450 = icmp eq i8* %82, %arraydecay.i.i.i.i449 - br i1 %cmp.i.i.i450, label %if.then.i.i452, label %if.else.i.i456 - -if.then.i.i452: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit417 - %arraydecay.i.i.i451 = bitcast %union.anon* %80 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i451, i8* %82, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit459 - -if.else.i.i456: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit417 - %_M_p.i21.i.i453 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_2_w_path, i64 0, i32 0, i32 0 - store i8* %82, i8** %_M_p.i21.i.i453, align 8, !tbaa !56, !alias.scope !74 - %_M_allocated_capacity.i.i454 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i447, i64 0, i32 2, i32 0 - %84 = load i64, i64* %_M_allocated_capacity.i.i454, align 8, !tbaa !12 - %_M_allocated_capacity.i.i.i455 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_2_w_path, i64 0, i32 2, i32 0 - store i64 %84, i64* %_M_allocated_capacity.i.i.i455, align 8, !tbaa !12, !alias.scope !74 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit459 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit459: ; preds = %if.then.i.i452, %if.else.i.i456 - %_M_string_length.i20.i.i457 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i447, i64 0, i32 1 - %85 = load i64, i64* %_M_string_length.i20.i.i457, align 8, !tbaa !53 - %_M_string_length.i.i2.i458 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_2_w_path, i64 0, i32 1 - store i64 %85, i64* %_M_string_length.i.i2.i458, align 8, !tbaa !53, !alias.scope !74 - %86 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i447 to %union.anon** - store %union.anon* %83, %union.anon** %86, align 8, !tbaa !56 - store i64 0, i64* %_M_string_length.i20.i.i457, align 8, !tbaa !53 - store i8 0, i8* %arraydecay.i.i.i.i449, align 1, !tbaa !36 - %_M_p.i.i.i.i460 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp12, i64 0, i32 0, i32 0 - %87 = load i8*, i8** %_M_p.i.i.i.i460, align 8, !tbaa !56 - %cmp.i.i.i462 = icmp eq i8* %87, %76 - br i1 %cmp.i.i.i462, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit464, label %if.then.i.i463 - -if.then.i.i463: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit459 - call void @_ZdlPv(i8* %87) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit464 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit464: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit459, %if.then.i.i463 - call void @llvm.lifetime.end(i64 32, i8* nonnull %73) #7 - %_M_p.i.i465 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_2_w_path, i64 0, i32 0, i32 0 - %88 = load i8*, i8** %_M_p.i.i465, align 8, !tbaa !56 - %call15 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %88, i32 0, i32 64, i32 64, i32 3, i32 3) + store %union.anon* %80, %union.anon** %81, align 8, !tbaa !58, !alias.scope !92 + %_M_p.i.i23.i.i456 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i455, i64 0, i32 0, i32 0 + %82 = load i8*, i8** %_M_p.i.i23.i.i456, align 8, !tbaa !62 + %83 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i455, i64 0, i32 2 + %arraydecay.i.i.i.i457 = bitcast %union.anon* %83 to i8* + %cmp.i.i.i458 = icmp eq i8* %82, %arraydecay.i.i.i.i457 + br i1 %cmp.i.i.i458, label %if.then.i.i460, label %if.else.i.i464 + +if.then.i.i460: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit425 + %arraydecay.i.i.i459 = bitcast %union.anon* %80 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i459, i8* %82, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit467 + +if.else.i.i464: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit425 + %_M_p.i21.i.i461 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_2_w_path, i64 0, i32 0, i32 0 + store i8* %82, i8** %_M_p.i21.i.i461, align 8, !tbaa !62, !alias.scope !92 + %_M_allocated_capacity.i.i462 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i455, i64 0, i32 2, i32 0 + %84 = load i64, i64* %_M_allocated_capacity.i.i462, align 8, !tbaa !15 + %_M_allocated_capacity.i.i.i463 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_2_w_path, i64 0, i32 2, i32 0 + store i64 %84, i64* %_M_allocated_capacity.i.i.i463, align 8, !tbaa !15, !alias.scope !92 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit467 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit467: ; preds = %if.then.i.i460, %if.else.i.i464 + %_M_string_length.i20.i.i465 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i455, i64 0, i32 1 + %85 = load i64, i64* %_M_string_length.i20.i.i465, align 8, !tbaa !59 + %_M_string_length.i.i2.i466 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_2_w_path, i64 0, i32 1 + store i64 %85, i64* %_M_string_length.i.i2.i466, align 8, !tbaa !59, !alias.scope !92 + %86 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i455 to %union.anon** + store %union.anon* %83, %union.anon** %86, align 8, !tbaa !62 + store i64 0, i64* %_M_string_length.i20.i.i465, align 8, !tbaa !59 + store i8 0, i8* %arraydecay.i.i.i.i457, align 1, !tbaa !42 + %_M_p.i.i.i.i468 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp12, i64 0, i32 0, i32 0 + %87 = load i8*, i8** %_M_p.i.i.i.i468, align 8, !tbaa !62 + %cmp.i.i.i470 = icmp eq i8* %87, %76 + br i1 %cmp.i.i.i470, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit472, label %if.then.i.i471 + +if.then.i.i471: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit467 + call void @_ZdlPv(i8* %87) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit472 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit472: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit467, %if.then.i.i471 + call void @llvm.lifetime.end(i64 32, i8* nonnull %73) #2 + %_M_p.i.i473 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_2_w_path, i64 0, i32 0, i32 0 + %88 = load i8*, i8** %_M_p.i.i473, align 8, !tbaa !62 + %call15 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %88, i32 0, i64 64, i64 64, i64 3, i64 3) %89 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_2_b_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %89) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %89) #2 %90 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp16 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %90) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %90) #2 %91 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp16, i64 0, i32 2 %92 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp16 to %union.anon** - store %union.anon* %91, %union.anon** %92, align 8, !tbaa !52 + store %union.anon* %91, %union.anon** %92, align 8, !tbaa !58 %93 = bitcast %union.anon* %91 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %93, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.29, i64 0, i64 0), i64 14, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i489 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp16, i64 0, i32 1 - store i64 14, i64* %_M_string_length.i.i.i.i.i.i489, align 8, !tbaa !53 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %93, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.47, i64 0, i64 0), i64 14, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i497 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp16, i64 0, i32 1 + store i64 14, i64* %_M_string_length.i.i.i.i.i.i497, align 8, !tbaa !59 %94 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp16, i64 0, i32 2, i32 1, i64 6 - store i8 0, i8* %94, align 2, !tbaa !36 - %95 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !53, !noalias !77 - %96 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !56, !noalias !77 - %call3.i.i.i494 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp16, i64 0, i64 0, i8* %96, i64 %95) #7, !noalias !77 + store i8 0, i8* %94, align 2, !tbaa !42 + %95 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !59, !noalias !95 + %96 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !62, !noalias !95 + %call3.i.i.i502 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp16, i64 0, i64 0, i8* %96, i64 %95) #2, !noalias !95 %97 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_2_b_path, i64 0, i32 2 %98 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_2_b_path to %union.anon** - store %union.anon* %97, %union.anon** %98, align 8, !tbaa !52, !alias.scope !77 - %_M_p.i.i23.i.i495 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i494, i64 0, i32 0, i32 0 - %99 = load i8*, i8** %_M_p.i.i23.i.i495, align 8, !tbaa !56 - %100 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i494, i64 0, i32 2 - %arraydecay.i.i.i.i496 = bitcast %union.anon* %100 to i8* - %cmp.i.i.i497 = icmp eq i8* %99, %arraydecay.i.i.i.i496 - br i1 %cmp.i.i.i497, label %if.then.i.i499, label %if.else.i.i503 - -if.then.i.i499: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit464 - %arraydecay.i.i.i498 = bitcast %union.anon* %97 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i498, i8* %99, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit506 - -if.else.i.i503: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit464 - %_M_p.i21.i.i500 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_2_b_path, i64 0, i32 0, i32 0 - store i8* %99, i8** %_M_p.i21.i.i500, align 8, !tbaa !56, !alias.scope !77 - %_M_allocated_capacity.i.i501 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i494, i64 0, i32 2, i32 0 - %101 = load i64, i64* %_M_allocated_capacity.i.i501, align 8, !tbaa !12 - %_M_allocated_capacity.i.i.i502 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_2_b_path, i64 0, i32 2, i32 0 - store i64 %101, i64* %_M_allocated_capacity.i.i.i502, align 8, !tbaa !12, !alias.scope !77 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit506 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit506: ; preds = %if.then.i.i499, %if.else.i.i503 - %_M_string_length.i20.i.i504 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i494, i64 0, i32 1 - %102 = load i64, i64* %_M_string_length.i20.i.i504, align 8, !tbaa !53 - %_M_string_length.i.i2.i505 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_2_b_path, i64 0, i32 1 - store i64 %102, i64* %_M_string_length.i.i2.i505, align 8, !tbaa !53, !alias.scope !77 - %103 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i494 to %union.anon** - store %union.anon* %100, %union.anon** %103, align 8, !tbaa !56 - store i64 0, i64* %_M_string_length.i20.i.i504, align 8, !tbaa !53 - store i8 0, i8* %arraydecay.i.i.i.i496, align 1, !tbaa !36 - %_M_p.i.i.i.i507 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp16, i64 0, i32 0, i32 0 - %104 = load i8*, i8** %_M_p.i.i.i.i507, align 8, !tbaa !56 - %cmp.i.i.i509 = icmp eq i8* %104, %93 - br i1 %cmp.i.i.i509, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit511, label %if.then.i.i510 - -if.then.i.i510: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit506 - call void @_ZdlPv(i8* %104) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit511 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit511: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit506, %if.then.i.i510 - call void @llvm.lifetime.end(i64 32, i8* nonnull %90) #7 - %_M_p.i.i512 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_2_b_path, i64 0, i32 0, i32 0 - %105 = load i8*, i8** %_M_p.i.i512, align 8, !tbaa !56 - %call19 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %105, i32 0, i32 1, i32 64, i32 1, i32 1) + store %union.anon* %97, %union.anon** %98, align 8, !tbaa !58, !alias.scope !95 + %_M_p.i.i23.i.i503 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i502, i64 0, i32 0, i32 0 + %99 = load i8*, i8** %_M_p.i.i23.i.i503, align 8, !tbaa !62 + %100 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i502, i64 0, i32 2 + %arraydecay.i.i.i.i504 = bitcast %union.anon* %100 to i8* + %cmp.i.i.i505 = icmp eq i8* %99, %arraydecay.i.i.i.i504 + br i1 %cmp.i.i.i505, label %if.then.i.i507, label %if.else.i.i511 + +if.then.i.i507: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit472 + %arraydecay.i.i.i506 = bitcast %union.anon* %97 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i506, i8* %99, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit514 + +if.else.i.i511: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit472 + %_M_p.i21.i.i508 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_2_b_path, i64 0, i32 0, i32 0 + store i8* %99, i8** %_M_p.i21.i.i508, align 8, !tbaa !62, !alias.scope !95 + %_M_allocated_capacity.i.i509 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i502, i64 0, i32 2, i32 0 + %101 = load i64, i64* %_M_allocated_capacity.i.i509, align 8, !tbaa !15 + %_M_allocated_capacity.i.i.i510 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_2_b_path, i64 0, i32 2, i32 0 + store i64 %101, i64* %_M_allocated_capacity.i.i.i510, align 8, !tbaa !15, !alias.scope !95 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit514 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit514: ; preds = %if.then.i.i507, %if.else.i.i511 + %_M_string_length.i20.i.i512 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i502, i64 0, i32 1 + %102 = load i64, i64* %_M_string_length.i20.i.i512, align 8, !tbaa !59 + %_M_string_length.i.i2.i513 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_2_b_path, i64 0, i32 1 + store i64 %102, i64* %_M_string_length.i.i2.i513, align 8, !tbaa !59, !alias.scope !95 + %103 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i502 to %union.anon** + store %union.anon* %100, %union.anon** %103, align 8, !tbaa !62 + store i64 0, i64* %_M_string_length.i20.i.i512, align 8, !tbaa !59 + store i8 0, i8* %arraydecay.i.i.i.i504, align 1, !tbaa !42 + %_M_p.i.i.i.i515 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp16, i64 0, i32 0, i32 0 + %104 = load i8*, i8** %_M_p.i.i.i.i515, align 8, !tbaa !62 + %cmp.i.i.i517 = icmp eq i8* %104, %93 + br i1 %cmp.i.i.i517, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit519, label %if.then.i.i518 + +if.then.i.i518: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit514 + call void @_ZdlPv(i8* %104) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit519 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit519: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit514, %if.then.i.i518 + call void @llvm.lifetime.end(i64 32, i8* nonnull %90) #2 + %_M_p.i.i520 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_2_b_path, i64 0, i32 0, i32 0 + %105 = load i8*, i8** %_M_p.i.i520, align 8, !tbaa !62 + %call19 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %105, i32 0, i64 1, i64 64, i64 1, i64 1) %106 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_3_w_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %106) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %106) #2 %107 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp20 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %107) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %107) #2 %108 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp20, i64 0, i32 2 %109 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp20 to %union.anon** - store %union.anon* %108, %union.anon** %109, align 8, !tbaa !52 + store %union.anon* %108, %union.anon** %109, align 8, !tbaa !58 %110 = bitcast %union.anon* %108 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %110, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.30, i64 0, i64 0), i64 14, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i536 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp20, i64 0, i32 1 - store i64 14, i64* %_M_string_length.i.i.i.i.i.i536, align 8, !tbaa !53 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %110, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.48, i64 0, i64 0), i64 14, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i544 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp20, i64 0, i32 1 + store i64 14, i64* %_M_string_length.i.i.i.i.i.i544, align 8, !tbaa !59 %111 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp20, i64 0, i32 2, i32 1, i64 6 - store i8 0, i8* %111, align 2, !tbaa !36 - %112 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !53, !noalias !80 - %113 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !56, !noalias !80 - %call3.i.i.i541 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp20, i64 0, i64 0, i8* %113, i64 %112) #7, !noalias !80 + store i8 0, i8* %111, align 2, !tbaa !42 + %112 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !59, !noalias !98 + %113 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !62, !noalias !98 + %call3.i.i.i549 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp20, i64 0, i64 0, i8* %113, i64 %112) #2, !noalias !98 %114 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_3_w_path, i64 0, i32 2 %115 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_3_w_path to %union.anon** - store %union.anon* %114, %union.anon** %115, align 8, !tbaa !52, !alias.scope !80 - %_M_p.i.i23.i.i542 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i541, i64 0, i32 0, i32 0 - %116 = load i8*, i8** %_M_p.i.i23.i.i542, align 8, !tbaa !56 - %117 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i541, i64 0, i32 2 - %arraydecay.i.i.i.i543 = bitcast %union.anon* %117 to i8* - %cmp.i.i.i544 = icmp eq i8* %116, %arraydecay.i.i.i.i543 - br i1 %cmp.i.i.i544, label %if.then.i.i546, label %if.else.i.i550 - -if.then.i.i546: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit511 - %arraydecay.i.i.i545 = bitcast %union.anon* %114 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i545, i8* %116, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit553 - -if.else.i.i550: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit511 - %_M_p.i21.i.i547 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_3_w_path, i64 0, i32 0, i32 0 - store i8* %116, i8** %_M_p.i21.i.i547, align 8, !tbaa !56, !alias.scope !80 - %_M_allocated_capacity.i.i548 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i541, i64 0, i32 2, i32 0 - %118 = load i64, i64* %_M_allocated_capacity.i.i548, align 8, !tbaa !12 - %_M_allocated_capacity.i.i.i549 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_3_w_path, i64 0, i32 2, i32 0 - store i64 %118, i64* %_M_allocated_capacity.i.i.i549, align 8, !tbaa !12, !alias.scope !80 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit553 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit553: ; preds = %if.then.i.i546, %if.else.i.i550 - %_M_string_length.i20.i.i551 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i541, i64 0, i32 1 - %119 = load i64, i64* %_M_string_length.i20.i.i551, align 8, !tbaa !53 - %_M_string_length.i.i2.i552 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_3_w_path, i64 0, i32 1 - store i64 %119, i64* %_M_string_length.i.i2.i552, align 8, !tbaa !53, !alias.scope !80 - %120 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i541 to %union.anon** - store %union.anon* %117, %union.anon** %120, align 8, !tbaa !56 - store i64 0, i64* %_M_string_length.i20.i.i551, align 8, !tbaa !53 - store i8 0, i8* %arraydecay.i.i.i.i543, align 1, !tbaa !36 - %_M_p.i.i.i.i554 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp20, i64 0, i32 0, i32 0 - %121 = load i8*, i8** %_M_p.i.i.i.i554, align 8, !tbaa !56 - %cmp.i.i.i556 = icmp eq i8* %121, %110 - br i1 %cmp.i.i.i556, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit558, label %if.then.i.i557 - -if.then.i.i557: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit553 - call void @_ZdlPv(i8* %121) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit558 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit558: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit553, %if.then.i.i557 - call void @llvm.lifetime.end(i64 32, i8* nonnull %107) #7 - %_M_p.i.i559 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_3_w_path, i64 0, i32 0, i32 0 - %122 = load i8*, i8** %_M_p.i.i559, align 8, !tbaa !56 - %call23 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %122, i32 0, i32 128, i32 64, i32 3, i32 3) + store %union.anon* %114, %union.anon** %115, align 8, !tbaa !58, !alias.scope !98 + %_M_p.i.i23.i.i550 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i549, i64 0, i32 0, i32 0 + %116 = load i8*, i8** %_M_p.i.i23.i.i550, align 8, !tbaa !62 + %117 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i549, i64 0, i32 2 + %arraydecay.i.i.i.i551 = bitcast %union.anon* %117 to i8* + %cmp.i.i.i552 = icmp eq i8* %116, %arraydecay.i.i.i.i551 + br i1 %cmp.i.i.i552, label %if.then.i.i554, label %if.else.i.i558 + +if.then.i.i554: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit519 + %arraydecay.i.i.i553 = bitcast %union.anon* %114 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i553, i8* %116, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit561 + +if.else.i.i558: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit519 + %_M_p.i21.i.i555 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_3_w_path, i64 0, i32 0, i32 0 + store i8* %116, i8** %_M_p.i21.i.i555, align 8, !tbaa !62, !alias.scope !98 + %_M_allocated_capacity.i.i556 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i549, i64 0, i32 2, i32 0 + %118 = load i64, i64* %_M_allocated_capacity.i.i556, align 8, !tbaa !15 + %_M_allocated_capacity.i.i.i557 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_3_w_path, i64 0, i32 2, i32 0 + store i64 %118, i64* %_M_allocated_capacity.i.i.i557, align 8, !tbaa !15, !alias.scope !98 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit561 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit561: ; preds = %if.then.i.i554, %if.else.i.i558 + %_M_string_length.i20.i.i559 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i549, i64 0, i32 1 + %119 = load i64, i64* %_M_string_length.i20.i.i559, align 8, !tbaa !59 + %_M_string_length.i.i2.i560 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_3_w_path, i64 0, i32 1 + store i64 %119, i64* %_M_string_length.i.i2.i560, align 8, !tbaa !59, !alias.scope !98 + %120 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i549 to %union.anon** + store %union.anon* %117, %union.anon** %120, align 8, !tbaa !62 + store i64 0, i64* %_M_string_length.i20.i.i559, align 8, !tbaa !59 + store i8 0, i8* %arraydecay.i.i.i.i551, align 1, !tbaa !42 + %_M_p.i.i.i.i562 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp20, i64 0, i32 0, i32 0 + %121 = load i8*, i8** %_M_p.i.i.i.i562, align 8, !tbaa !62 + %cmp.i.i.i564 = icmp eq i8* %121, %110 + br i1 %cmp.i.i.i564, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit566, label %if.then.i.i565 + +if.then.i.i565: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit561 + call void @_ZdlPv(i8* %121) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit566 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit566: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit561, %if.then.i.i565 + call void @llvm.lifetime.end(i64 32, i8* nonnull %107) #2 + %_M_p.i.i567 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_3_w_path, i64 0, i32 0, i32 0 + %122 = load i8*, i8** %_M_p.i.i567, align 8, !tbaa !62 + %call23 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %122, i32 0, i64 128, i64 64, i64 3, i64 3) %123 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_3_b_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %123) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %123) #2 %124 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp24 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %124) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %124) #2 %125 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp24, i64 0, i32 2 %126 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp24 to %union.anon** - store %union.anon* %125, %union.anon** %126, align 8, !tbaa !52 + store %union.anon* %125, %union.anon** %126, align 8, !tbaa !58 %127 = bitcast %union.anon* %125 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %127, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.31, i64 0, i64 0), i64 14, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i583 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp24, i64 0, i32 1 - store i64 14, i64* %_M_string_length.i.i.i.i.i.i583, align 8, !tbaa !53 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %127, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.49, i64 0, i64 0), i64 14, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i591 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp24, i64 0, i32 1 + store i64 14, i64* %_M_string_length.i.i.i.i.i.i591, align 8, !tbaa !59 %128 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp24, i64 0, i32 2, i32 1, i64 6 - store i8 0, i8* %128, align 2, !tbaa !36 - %129 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !53, !noalias !83 - %130 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !56, !noalias !83 - %call3.i.i.i588 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp24, i64 0, i64 0, i8* %130, i64 %129) #7, !noalias !83 + store i8 0, i8* %128, align 2, !tbaa !42 + %129 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !59, !noalias !101 + %130 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !62, !noalias !101 + %call3.i.i.i596 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp24, i64 0, i64 0, i8* %130, i64 %129) #2, !noalias !101 %131 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_3_b_path, i64 0, i32 2 %132 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_3_b_path to %union.anon** - store %union.anon* %131, %union.anon** %132, align 8, !tbaa !52, !alias.scope !83 - %_M_p.i.i23.i.i589 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i588, i64 0, i32 0, i32 0 - %133 = load i8*, i8** %_M_p.i.i23.i.i589, align 8, !tbaa !56 - %134 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i588, i64 0, i32 2 - %arraydecay.i.i.i.i590 = bitcast %union.anon* %134 to i8* - %cmp.i.i.i591 = icmp eq i8* %133, %arraydecay.i.i.i.i590 - br i1 %cmp.i.i.i591, label %if.then.i.i593, label %if.else.i.i597 - -if.then.i.i593: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit558 - %arraydecay.i.i.i592 = bitcast %union.anon* %131 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i592, i8* %133, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit600 - -if.else.i.i597: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit558 - %_M_p.i21.i.i594 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_3_b_path, i64 0, i32 0, i32 0 - store i8* %133, i8** %_M_p.i21.i.i594, align 8, !tbaa !56, !alias.scope !83 - %_M_allocated_capacity.i.i595 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i588, i64 0, i32 2, i32 0 - %135 = load i64, i64* %_M_allocated_capacity.i.i595, align 8, !tbaa !12 - %_M_allocated_capacity.i.i.i596 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_3_b_path, i64 0, i32 2, i32 0 - store i64 %135, i64* %_M_allocated_capacity.i.i.i596, align 8, !tbaa !12, !alias.scope !83 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit600 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit600: ; preds = %if.then.i.i593, %if.else.i.i597 - %_M_string_length.i20.i.i598 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i588, i64 0, i32 1 - %136 = load i64, i64* %_M_string_length.i20.i.i598, align 8, !tbaa !53 - %_M_string_length.i.i2.i599 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_3_b_path, i64 0, i32 1 - store i64 %136, i64* %_M_string_length.i.i2.i599, align 8, !tbaa !53, !alias.scope !83 - %137 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i588 to %union.anon** - store %union.anon* %134, %union.anon** %137, align 8, !tbaa !56 - store i64 0, i64* %_M_string_length.i20.i.i598, align 8, !tbaa !53 - store i8 0, i8* %arraydecay.i.i.i.i590, align 1, !tbaa !36 - %_M_p.i.i.i.i601 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp24, i64 0, i32 0, i32 0 - %138 = load i8*, i8** %_M_p.i.i.i.i601, align 8, !tbaa !56 - %cmp.i.i.i603 = icmp eq i8* %138, %127 - br i1 %cmp.i.i.i603, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit605, label %if.then.i.i604 - -if.then.i.i604: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit600 - call void @_ZdlPv(i8* %138) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit605 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit605: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit600, %if.then.i.i604 - call void @llvm.lifetime.end(i64 32, i8* nonnull %124) #7 - %_M_p.i.i606 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_3_b_path, i64 0, i32 0, i32 0 - %139 = load i8*, i8** %_M_p.i.i606, align 8, !tbaa !56 - %call27 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %139, i32 0, i32 1, i32 128, i32 1, i32 1) + store %union.anon* %131, %union.anon** %132, align 8, !tbaa !58, !alias.scope !101 + %_M_p.i.i23.i.i597 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i596, i64 0, i32 0, i32 0 + %133 = load i8*, i8** %_M_p.i.i23.i.i597, align 8, !tbaa !62 + %134 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i596, i64 0, i32 2 + %arraydecay.i.i.i.i598 = bitcast %union.anon* %134 to i8* + %cmp.i.i.i599 = icmp eq i8* %133, %arraydecay.i.i.i.i598 + br i1 %cmp.i.i.i599, label %if.then.i.i601, label %if.else.i.i605 + +if.then.i.i601: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit566 + %arraydecay.i.i.i600 = bitcast %union.anon* %131 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i600, i8* %133, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit608 + +if.else.i.i605: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit566 + %_M_p.i21.i.i602 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_3_b_path, i64 0, i32 0, i32 0 + store i8* %133, i8** %_M_p.i21.i.i602, align 8, !tbaa !62, !alias.scope !101 + %_M_allocated_capacity.i.i603 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i596, i64 0, i32 2, i32 0 + %135 = load i64, i64* %_M_allocated_capacity.i.i603, align 8, !tbaa !15 + %_M_allocated_capacity.i.i.i604 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_3_b_path, i64 0, i32 2, i32 0 + store i64 %135, i64* %_M_allocated_capacity.i.i.i604, align 8, !tbaa !15, !alias.scope !101 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit608 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit608: ; preds = %if.then.i.i601, %if.else.i.i605 + %_M_string_length.i20.i.i606 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i596, i64 0, i32 1 + %136 = load i64, i64* %_M_string_length.i20.i.i606, align 8, !tbaa !59 + %_M_string_length.i.i2.i607 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_3_b_path, i64 0, i32 1 + store i64 %136, i64* %_M_string_length.i.i2.i607, align 8, !tbaa !59, !alias.scope !101 + %137 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i596 to %union.anon** + store %union.anon* %134, %union.anon** %137, align 8, !tbaa !62 + store i64 0, i64* %_M_string_length.i20.i.i606, align 8, !tbaa !59 + store i8 0, i8* %arraydecay.i.i.i.i598, align 1, !tbaa !42 + %_M_p.i.i.i.i609 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp24, i64 0, i32 0, i32 0 + %138 = load i8*, i8** %_M_p.i.i.i.i609, align 8, !tbaa !62 + %cmp.i.i.i611 = icmp eq i8* %138, %127 + br i1 %cmp.i.i.i611, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit613, label %if.then.i.i612 + +if.then.i.i612: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit608 + call void @_ZdlPv(i8* %138) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit613 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit613: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit608, %if.then.i.i612 + call void @llvm.lifetime.end(i64 32, i8* nonnull %124) #2 + %_M_p.i.i614 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_3_b_path, i64 0, i32 0, i32 0 + %139 = load i8*, i8** %_M_p.i.i614, align 8, !tbaa !62 + %call27 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %139, i32 0, i64 1, i64 128, i64 1, i64 1) %140 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_4_w_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %140) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %140) #2 %141 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp28 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %141) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %141) #2 %142 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp28, i64 0, i32 2 %143 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp28 to %union.anon** - store %union.anon* %142, %union.anon** %143, align 8, !tbaa !52 + store %union.anon* %142, %union.anon** %143, align 8, !tbaa !58 %144 = bitcast %union.anon* %142 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %144, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.32, i64 0, i64 0), i64 14, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i630 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp28, i64 0, i32 1 - store i64 14, i64* %_M_string_length.i.i.i.i.i.i630, align 8, !tbaa !53 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %144, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.50, i64 0, i64 0), i64 14, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i638 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp28, i64 0, i32 1 + store i64 14, i64* %_M_string_length.i.i.i.i.i.i638, align 8, !tbaa !59 %145 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp28, i64 0, i32 2, i32 1, i64 6 - store i8 0, i8* %145, align 2, !tbaa !36 - %146 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !53, !noalias !86 - %147 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !56, !noalias !86 - %call3.i.i.i635 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp28, i64 0, i64 0, i8* %147, i64 %146) #7, !noalias !86 + store i8 0, i8* %145, align 2, !tbaa !42 + %146 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !59, !noalias !104 + %147 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !62, !noalias !104 + %call3.i.i.i643 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp28, i64 0, i64 0, i8* %147, i64 %146) #2, !noalias !104 %148 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_4_w_path, i64 0, i32 2 %149 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_4_w_path to %union.anon** - store %union.anon* %148, %union.anon** %149, align 8, !tbaa !52, !alias.scope !86 - %_M_p.i.i23.i.i636 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i635, i64 0, i32 0, i32 0 - %150 = load i8*, i8** %_M_p.i.i23.i.i636, align 8, !tbaa !56 - %151 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i635, i64 0, i32 2 - %arraydecay.i.i.i.i637 = bitcast %union.anon* %151 to i8* - %cmp.i.i.i638 = icmp eq i8* %150, %arraydecay.i.i.i.i637 - br i1 %cmp.i.i.i638, label %if.then.i.i640, label %if.else.i.i644 - -if.then.i.i640: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit605 - %arraydecay.i.i.i639 = bitcast %union.anon* %148 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i639, i8* %150, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit647 - -if.else.i.i644: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit605 - %_M_p.i21.i.i641 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_4_w_path, i64 0, i32 0, i32 0 - store i8* %150, i8** %_M_p.i21.i.i641, align 8, !tbaa !56, !alias.scope !86 - %_M_allocated_capacity.i.i642 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i635, i64 0, i32 2, i32 0 - %152 = load i64, i64* %_M_allocated_capacity.i.i642, align 8, !tbaa !12 - %_M_allocated_capacity.i.i.i643 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_4_w_path, i64 0, i32 2, i32 0 - store i64 %152, i64* %_M_allocated_capacity.i.i.i643, align 8, !tbaa !12, !alias.scope !86 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit647 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit647: ; preds = %if.then.i.i640, %if.else.i.i644 - %_M_string_length.i20.i.i645 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i635, i64 0, i32 1 - %153 = load i64, i64* %_M_string_length.i20.i.i645, align 8, !tbaa !53 - %_M_string_length.i.i2.i646 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_4_w_path, i64 0, i32 1 - store i64 %153, i64* %_M_string_length.i.i2.i646, align 8, !tbaa !53, !alias.scope !86 - %154 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i635 to %union.anon** - store %union.anon* %151, %union.anon** %154, align 8, !tbaa !56 - store i64 0, i64* %_M_string_length.i20.i.i645, align 8, !tbaa !53 - store i8 0, i8* %arraydecay.i.i.i.i637, align 1, !tbaa !36 - %_M_p.i.i.i.i648 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp28, i64 0, i32 0, i32 0 - %155 = load i8*, i8** %_M_p.i.i.i.i648, align 8, !tbaa !56 - %cmp.i.i.i650 = icmp eq i8* %155, %144 - br i1 %cmp.i.i.i650, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit652, label %if.then.i.i651 - -if.then.i.i651: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit647 - call void @_ZdlPv(i8* %155) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit652 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit652: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit647, %if.then.i.i651 - call void @llvm.lifetime.end(i64 32, i8* nonnull %141) #7 - %_M_p.i.i653 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_4_w_path, i64 0, i32 0, i32 0 - %156 = load i8*, i8** %_M_p.i.i653, align 8, !tbaa !56 - %call31 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %156, i32 0, i32 128, i32 128, i32 3, i32 3) + store %union.anon* %148, %union.anon** %149, align 8, !tbaa !58, !alias.scope !104 + %_M_p.i.i23.i.i644 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i643, i64 0, i32 0, i32 0 + %150 = load i8*, i8** %_M_p.i.i23.i.i644, align 8, !tbaa !62 + %151 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i643, i64 0, i32 2 + %arraydecay.i.i.i.i645 = bitcast %union.anon* %151 to i8* + %cmp.i.i.i646 = icmp eq i8* %150, %arraydecay.i.i.i.i645 + br i1 %cmp.i.i.i646, label %if.then.i.i648, label %if.else.i.i652 + +if.then.i.i648: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit613 + %arraydecay.i.i.i647 = bitcast %union.anon* %148 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i647, i8* %150, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit655 + +if.else.i.i652: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit613 + %_M_p.i21.i.i649 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_4_w_path, i64 0, i32 0, i32 0 + store i8* %150, i8** %_M_p.i21.i.i649, align 8, !tbaa !62, !alias.scope !104 + %_M_allocated_capacity.i.i650 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i643, i64 0, i32 2, i32 0 + %152 = load i64, i64* %_M_allocated_capacity.i.i650, align 8, !tbaa !15 + %_M_allocated_capacity.i.i.i651 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_4_w_path, i64 0, i32 2, i32 0 + store i64 %152, i64* %_M_allocated_capacity.i.i.i651, align 8, !tbaa !15, !alias.scope !104 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit655 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit655: ; preds = %if.then.i.i648, %if.else.i.i652 + %_M_string_length.i20.i.i653 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i643, i64 0, i32 1 + %153 = load i64, i64* %_M_string_length.i20.i.i653, align 8, !tbaa !59 + %_M_string_length.i.i2.i654 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_4_w_path, i64 0, i32 1 + store i64 %153, i64* %_M_string_length.i.i2.i654, align 8, !tbaa !59, !alias.scope !104 + %154 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i643 to %union.anon** + store %union.anon* %151, %union.anon** %154, align 8, !tbaa !62 + store i64 0, i64* %_M_string_length.i20.i.i653, align 8, !tbaa !59 + store i8 0, i8* %arraydecay.i.i.i.i645, align 1, !tbaa !42 + %_M_p.i.i.i.i656 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp28, i64 0, i32 0, i32 0 + %155 = load i8*, i8** %_M_p.i.i.i.i656, align 8, !tbaa !62 + %cmp.i.i.i658 = icmp eq i8* %155, %144 + br i1 %cmp.i.i.i658, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit660, label %if.then.i.i659 + +if.then.i.i659: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit655 + call void @_ZdlPv(i8* %155) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit660 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit660: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit655, %if.then.i.i659 + call void @llvm.lifetime.end(i64 32, i8* nonnull %141) #2 + %_M_p.i.i661 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_4_w_path, i64 0, i32 0, i32 0 + %156 = load i8*, i8** %_M_p.i.i661, align 8, !tbaa !62 + %call31 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %156, i32 0, i64 128, i64 128, i64 3, i64 3) %157 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_4_b_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %157) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %157) #2 %158 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp32 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %158) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %158) #2 %159 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp32, i64 0, i32 2 %160 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp32 to %union.anon** - store %union.anon* %159, %union.anon** %160, align 8, !tbaa !52 + store %union.anon* %159, %union.anon** %160, align 8, !tbaa !58 %161 = bitcast %union.anon* %159 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %161, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.33, i64 0, i64 0), i64 14, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i677 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp32, i64 0, i32 1 - store i64 14, i64* %_M_string_length.i.i.i.i.i.i677, align 8, !tbaa !53 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %161, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.51, i64 0, i64 0), i64 14, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i685 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp32, i64 0, i32 1 + store i64 14, i64* %_M_string_length.i.i.i.i.i.i685, align 8, !tbaa !59 %162 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp32, i64 0, i32 2, i32 1, i64 6 - store i8 0, i8* %162, align 2, !tbaa !36 - %163 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !53, !noalias !89 - %164 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !56, !noalias !89 - %call3.i.i.i682 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp32, i64 0, i64 0, i8* %164, i64 %163) #7, !noalias !89 + store i8 0, i8* %162, align 2, !tbaa !42 + %163 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !59, !noalias !107 + %164 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !62, !noalias !107 + %call3.i.i.i690 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp32, i64 0, i64 0, i8* %164, i64 %163) #2, !noalias !107 %165 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_4_b_path, i64 0, i32 2 %166 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_4_b_path to %union.anon** - store %union.anon* %165, %union.anon** %166, align 8, !tbaa !52, !alias.scope !89 - %_M_p.i.i23.i.i683 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i682, i64 0, i32 0, i32 0 - %167 = load i8*, i8** %_M_p.i.i23.i.i683, align 8, !tbaa !56 - %168 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i682, i64 0, i32 2 - %arraydecay.i.i.i.i684 = bitcast %union.anon* %168 to i8* - %cmp.i.i.i685 = icmp eq i8* %167, %arraydecay.i.i.i.i684 - br i1 %cmp.i.i.i685, label %if.then.i.i687, label %if.else.i.i691 - -if.then.i.i687: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit652 - %arraydecay.i.i.i686 = bitcast %union.anon* %165 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i686, i8* %167, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit694 - -if.else.i.i691: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit652 - %_M_p.i21.i.i688 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_4_b_path, i64 0, i32 0, i32 0 - store i8* %167, i8** %_M_p.i21.i.i688, align 8, !tbaa !56, !alias.scope !89 - %_M_allocated_capacity.i.i689 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i682, i64 0, i32 2, i32 0 - %169 = load i64, i64* %_M_allocated_capacity.i.i689, align 8, !tbaa !12 - %_M_allocated_capacity.i.i.i690 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_4_b_path, i64 0, i32 2, i32 0 - store i64 %169, i64* %_M_allocated_capacity.i.i.i690, align 8, !tbaa !12, !alias.scope !89 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit694 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit694: ; preds = %if.then.i.i687, %if.else.i.i691 - %_M_string_length.i20.i.i692 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i682, i64 0, i32 1 - %170 = load i64, i64* %_M_string_length.i20.i.i692, align 8, !tbaa !53 - %_M_string_length.i.i2.i693 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_4_b_path, i64 0, i32 1 - store i64 %170, i64* %_M_string_length.i.i2.i693, align 8, !tbaa !53, !alias.scope !89 - %171 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i682 to %union.anon** - store %union.anon* %168, %union.anon** %171, align 8, !tbaa !56 - store i64 0, i64* %_M_string_length.i20.i.i692, align 8, !tbaa !53 - store i8 0, i8* %arraydecay.i.i.i.i684, align 1, !tbaa !36 - %_M_p.i.i.i.i695 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp32, i64 0, i32 0, i32 0 - %172 = load i8*, i8** %_M_p.i.i.i.i695, align 8, !tbaa !56 - %cmp.i.i.i697 = icmp eq i8* %172, %161 - br i1 %cmp.i.i.i697, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit699, label %if.then.i.i698 - -if.then.i.i698: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit694 - call void @_ZdlPv(i8* %172) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit699 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit699: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit694, %if.then.i.i698 - call void @llvm.lifetime.end(i64 32, i8* nonnull %158) #7 - %_M_p.i.i700 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_4_b_path, i64 0, i32 0, i32 0 - %173 = load i8*, i8** %_M_p.i.i700, align 8, !tbaa !56 - %call35 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %173, i32 0, i32 1, i32 128, i32 1, i32 1) + store %union.anon* %165, %union.anon** %166, align 8, !tbaa !58, !alias.scope !107 + %_M_p.i.i23.i.i691 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i690, i64 0, i32 0, i32 0 + %167 = load i8*, i8** %_M_p.i.i23.i.i691, align 8, !tbaa !62 + %168 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i690, i64 0, i32 2 + %arraydecay.i.i.i.i692 = bitcast %union.anon* %168 to i8* + %cmp.i.i.i693 = icmp eq i8* %167, %arraydecay.i.i.i.i692 + br i1 %cmp.i.i.i693, label %if.then.i.i695, label %if.else.i.i699 + +if.then.i.i695: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit660 + %arraydecay.i.i.i694 = bitcast %union.anon* %165 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i694, i8* %167, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit702 + +if.else.i.i699: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit660 + %_M_p.i21.i.i696 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_4_b_path, i64 0, i32 0, i32 0 + store i8* %167, i8** %_M_p.i21.i.i696, align 8, !tbaa !62, !alias.scope !107 + %_M_allocated_capacity.i.i697 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i690, i64 0, i32 2, i32 0 + %169 = load i64, i64* %_M_allocated_capacity.i.i697, align 8, !tbaa !15 + %_M_allocated_capacity.i.i.i698 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_4_b_path, i64 0, i32 2, i32 0 + store i64 %169, i64* %_M_allocated_capacity.i.i.i698, align 8, !tbaa !15, !alias.scope !107 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit702 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit702: ; preds = %if.then.i.i695, %if.else.i.i699 + %_M_string_length.i20.i.i700 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i690, i64 0, i32 1 + %170 = load i64, i64* %_M_string_length.i20.i.i700, align 8, !tbaa !59 + %_M_string_length.i.i2.i701 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_4_b_path, i64 0, i32 1 + store i64 %170, i64* %_M_string_length.i.i2.i701, align 8, !tbaa !59, !alias.scope !107 + %171 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i690 to %union.anon** + store %union.anon* %168, %union.anon** %171, align 8, !tbaa !62 + store i64 0, i64* %_M_string_length.i20.i.i700, align 8, !tbaa !59 + store i8 0, i8* %arraydecay.i.i.i.i692, align 1, !tbaa !42 + %_M_p.i.i.i.i703 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp32, i64 0, i32 0, i32 0 + %172 = load i8*, i8** %_M_p.i.i.i.i703, align 8, !tbaa !62 + %cmp.i.i.i705 = icmp eq i8* %172, %161 + br i1 %cmp.i.i.i705, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit707, label %if.then.i.i706 + +if.then.i.i706: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit702 + call void @_ZdlPv(i8* %172) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit707 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit707: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit702, %if.then.i.i706 + call void @llvm.lifetime.end(i64 32, i8* nonnull %158) #2 + %_M_p.i.i708 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_4_b_path, i64 0, i32 0, i32 0 + %173 = load i8*, i8** %_M_p.i.i708, align 8, !tbaa !62 + %call35 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %173, i32 0, i64 1, i64 128, i64 1, i64 1) %174 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_5_w_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %174) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %174) #2 %175 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp36 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %175) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %175) #2 %176 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp36, i64 0, i32 2 %177 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp36 to %union.anon** - store %union.anon* %176, %union.anon** %177, align 8, !tbaa !52 + store %union.anon* %176, %union.anon** %177, align 8, !tbaa !58 %178 = bitcast %union.anon* %176 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %178, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.34, i64 0, i64 0), i64 14, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i724 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp36, i64 0, i32 1 - store i64 14, i64* %_M_string_length.i.i.i.i.i.i724, align 8, !tbaa !53 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %178, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.52, i64 0, i64 0), i64 14, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i732 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp36, i64 0, i32 1 + store i64 14, i64* %_M_string_length.i.i.i.i.i.i732, align 8, !tbaa !59 %179 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp36, i64 0, i32 2, i32 1, i64 6 - store i8 0, i8* %179, align 2, !tbaa !36 - %180 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !53, !noalias !92 - %181 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !56, !noalias !92 - %call3.i.i.i729 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp36, i64 0, i64 0, i8* %181, i64 %180) #7, !noalias !92 + store i8 0, i8* %179, align 2, !tbaa !42 + %180 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !59, !noalias !110 + %181 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !62, !noalias !110 + %call3.i.i.i737 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp36, i64 0, i64 0, i8* %181, i64 %180) #2, !noalias !110 %182 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_5_w_path, i64 0, i32 2 %183 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_5_w_path to %union.anon** - store %union.anon* %182, %union.anon** %183, align 8, !tbaa !52, !alias.scope !92 - %_M_p.i.i23.i.i730 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i729, i64 0, i32 0, i32 0 - %184 = load i8*, i8** %_M_p.i.i23.i.i730, align 8, !tbaa !56 - %185 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i729, i64 0, i32 2 - %arraydecay.i.i.i.i731 = bitcast %union.anon* %185 to i8* - %cmp.i.i.i732 = icmp eq i8* %184, %arraydecay.i.i.i.i731 - br i1 %cmp.i.i.i732, label %if.then.i.i734, label %if.else.i.i738 - -if.then.i.i734: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit699 - %arraydecay.i.i.i733 = bitcast %union.anon* %182 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i733, i8* %184, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit741 - -if.else.i.i738: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit699 - %_M_p.i21.i.i735 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_5_w_path, i64 0, i32 0, i32 0 - store i8* %184, i8** %_M_p.i21.i.i735, align 8, !tbaa !56, !alias.scope !92 - %_M_allocated_capacity.i.i736 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i729, i64 0, i32 2, i32 0 - %186 = load i64, i64* %_M_allocated_capacity.i.i736, align 8, !tbaa !12 - %_M_allocated_capacity.i.i.i737 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_5_w_path, i64 0, i32 2, i32 0 - store i64 %186, i64* %_M_allocated_capacity.i.i.i737, align 8, !tbaa !12, !alias.scope !92 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit741 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit741: ; preds = %if.then.i.i734, %if.else.i.i738 - %_M_string_length.i20.i.i739 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i729, i64 0, i32 1 - %187 = load i64, i64* %_M_string_length.i20.i.i739, align 8, !tbaa !53 - %_M_string_length.i.i2.i740 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_5_w_path, i64 0, i32 1 - store i64 %187, i64* %_M_string_length.i.i2.i740, align 8, !tbaa !53, !alias.scope !92 - %188 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i729 to %union.anon** - store %union.anon* %185, %union.anon** %188, align 8, !tbaa !56 - store i64 0, i64* %_M_string_length.i20.i.i739, align 8, !tbaa !53 - store i8 0, i8* %arraydecay.i.i.i.i731, align 1, !tbaa !36 - %_M_p.i.i.i.i742 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp36, i64 0, i32 0, i32 0 - %189 = load i8*, i8** %_M_p.i.i.i.i742, align 8, !tbaa !56 - %cmp.i.i.i744 = icmp eq i8* %189, %178 - br i1 %cmp.i.i.i744, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit746, label %if.then.i.i745 - -if.then.i.i745: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit741 - call void @_ZdlPv(i8* %189) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit746 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit746: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit741, %if.then.i.i745 - call void @llvm.lifetime.end(i64 32, i8* nonnull %175) #7 - %_M_p.i.i747 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_5_w_path, i64 0, i32 0, i32 0 - %190 = load i8*, i8** %_M_p.i.i747, align 8, !tbaa !56 - %call39 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %190, i32 0, i32 256, i32 128, i32 3, i32 3) + store %union.anon* %182, %union.anon** %183, align 8, !tbaa !58, !alias.scope !110 + %_M_p.i.i23.i.i738 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i737, i64 0, i32 0, i32 0 + %184 = load i8*, i8** %_M_p.i.i23.i.i738, align 8, !tbaa !62 + %185 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i737, i64 0, i32 2 + %arraydecay.i.i.i.i739 = bitcast %union.anon* %185 to i8* + %cmp.i.i.i740 = icmp eq i8* %184, %arraydecay.i.i.i.i739 + br i1 %cmp.i.i.i740, label %if.then.i.i742, label %if.else.i.i746 + +if.then.i.i742: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit707 + %arraydecay.i.i.i741 = bitcast %union.anon* %182 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i741, i8* %184, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit749 + +if.else.i.i746: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit707 + %_M_p.i21.i.i743 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_5_w_path, i64 0, i32 0, i32 0 + store i8* %184, i8** %_M_p.i21.i.i743, align 8, !tbaa !62, !alias.scope !110 + %_M_allocated_capacity.i.i744 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i737, i64 0, i32 2, i32 0 + %186 = load i64, i64* %_M_allocated_capacity.i.i744, align 8, !tbaa !15 + %_M_allocated_capacity.i.i.i745 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_5_w_path, i64 0, i32 2, i32 0 + store i64 %186, i64* %_M_allocated_capacity.i.i.i745, align 8, !tbaa !15, !alias.scope !110 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit749 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit749: ; preds = %if.then.i.i742, %if.else.i.i746 + %_M_string_length.i20.i.i747 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i737, i64 0, i32 1 + %187 = load i64, i64* %_M_string_length.i20.i.i747, align 8, !tbaa !59 + %_M_string_length.i.i2.i748 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_5_w_path, i64 0, i32 1 + store i64 %187, i64* %_M_string_length.i.i2.i748, align 8, !tbaa !59, !alias.scope !110 + %188 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i737 to %union.anon** + store %union.anon* %185, %union.anon** %188, align 8, !tbaa !62 + store i64 0, i64* %_M_string_length.i20.i.i747, align 8, !tbaa !59 + store i8 0, i8* %arraydecay.i.i.i.i739, align 1, !tbaa !42 + %_M_p.i.i.i.i750 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp36, i64 0, i32 0, i32 0 + %189 = load i8*, i8** %_M_p.i.i.i.i750, align 8, !tbaa !62 + %cmp.i.i.i752 = icmp eq i8* %189, %178 + br i1 %cmp.i.i.i752, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit754, label %if.then.i.i753 + +if.then.i.i753: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit749 + call void @_ZdlPv(i8* %189) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit754 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit754: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit749, %if.then.i.i753 + call void @llvm.lifetime.end(i64 32, i8* nonnull %175) #2 + %_M_p.i.i755 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_5_w_path, i64 0, i32 0, i32 0 + %190 = load i8*, i8** %_M_p.i.i755, align 8, !tbaa !62 + %call39 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %190, i32 0, i64 256, i64 128, i64 3, i64 3) %191 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_5_b_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %191) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %191) #2 %192 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp40 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %192) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %192) #2 %193 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp40, i64 0, i32 2 %194 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp40 to %union.anon** - store %union.anon* %193, %union.anon** %194, align 8, !tbaa !52 + store %union.anon* %193, %union.anon** %194, align 8, !tbaa !58 %195 = bitcast %union.anon* %193 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %195, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.35, i64 0, i64 0), i64 14, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i771 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp40, i64 0, i32 1 - store i64 14, i64* %_M_string_length.i.i.i.i.i.i771, align 8, !tbaa !53 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %195, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.53, i64 0, i64 0), i64 14, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i779 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp40, i64 0, i32 1 + store i64 14, i64* %_M_string_length.i.i.i.i.i.i779, align 8, !tbaa !59 %196 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp40, i64 0, i32 2, i32 1, i64 6 - store i8 0, i8* %196, align 2, !tbaa !36 - %197 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !53, !noalias !95 - %198 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !56, !noalias !95 - %call3.i.i.i776 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp40, i64 0, i64 0, i8* %198, i64 %197) #7, !noalias !95 + store i8 0, i8* %196, align 2, !tbaa !42 + %197 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !59, !noalias !113 + %198 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !62, !noalias !113 + %call3.i.i.i784 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp40, i64 0, i64 0, i8* %198, i64 %197) #2, !noalias !113 %199 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_5_b_path, i64 0, i32 2 %200 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_5_b_path to %union.anon** - store %union.anon* %199, %union.anon** %200, align 8, !tbaa !52, !alias.scope !95 - %_M_p.i.i23.i.i777 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i776, i64 0, i32 0, i32 0 - %201 = load i8*, i8** %_M_p.i.i23.i.i777, align 8, !tbaa !56 - %202 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i776, i64 0, i32 2 - %arraydecay.i.i.i.i778 = bitcast %union.anon* %202 to i8* - %cmp.i.i.i779 = icmp eq i8* %201, %arraydecay.i.i.i.i778 - br i1 %cmp.i.i.i779, label %if.then.i.i781, label %if.else.i.i785 - -if.then.i.i781: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit746 - %arraydecay.i.i.i780 = bitcast %union.anon* %199 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i780, i8* %201, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit788 - -if.else.i.i785: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit746 - %_M_p.i21.i.i782 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_5_b_path, i64 0, i32 0, i32 0 - store i8* %201, i8** %_M_p.i21.i.i782, align 8, !tbaa !56, !alias.scope !95 - %_M_allocated_capacity.i.i783 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i776, i64 0, i32 2, i32 0 - %203 = load i64, i64* %_M_allocated_capacity.i.i783, align 8, !tbaa !12 - %_M_allocated_capacity.i.i.i784 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_5_b_path, i64 0, i32 2, i32 0 - store i64 %203, i64* %_M_allocated_capacity.i.i.i784, align 8, !tbaa !12, !alias.scope !95 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit788 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit788: ; preds = %if.then.i.i781, %if.else.i.i785 - %_M_string_length.i20.i.i786 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i776, i64 0, i32 1 - %204 = load i64, i64* %_M_string_length.i20.i.i786, align 8, !tbaa !53 - %_M_string_length.i.i2.i787 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_5_b_path, i64 0, i32 1 - store i64 %204, i64* %_M_string_length.i.i2.i787, align 8, !tbaa !53, !alias.scope !95 - %205 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i776 to %union.anon** - store %union.anon* %202, %union.anon** %205, align 8, !tbaa !56 - store i64 0, i64* %_M_string_length.i20.i.i786, align 8, !tbaa !53 - store i8 0, i8* %arraydecay.i.i.i.i778, align 1, !tbaa !36 - %_M_p.i.i.i.i789 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp40, i64 0, i32 0, i32 0 - %206 = load i8*, i8** %_M_p.i.i.i.i789, align 8, !tbaa !56 - %cmp.i.i.i791 = icmp eq i8* %206, %195 - br i1 %cmp.i.i.i791, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit793, label %if.then.i.i792 - -if.then.i.i792: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit788 - call void @_ZdlPv(i8* %206) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit793 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit793: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit788, %if.then.i.i792 - call void @llvm.lifetime.end(i64 32, i8* nonnull %192) #7 - %_M_p.i.i794 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_5_b_path, i64 0, i32 0, i32 0 - %207 = load i8*, i8** %_M_p.i.i794, align 8, !tbaa !56 - %call43 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %207, i32 0, i32 1, i32 256, i32 1, i32 1) + store %union.anon* %199, %union.anon** %200, align 8, !tbaa !58, !alias.scope !113 + %_M_p.i.i23.i.i785 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i784, i64 0, i32 0, i32 0 + %201 = load i8*, i8** %_M_p.i.i23.i.i785, align 8, !tbaa !62 + %202 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i784, i64 0, i32 2 + %arraydecay.i.i.i.i786 = bitcast %union.anon* %202 to i8* + %cmp.i.i.i787 = icmp eq i8* %201, %arraydecay.i.i.i.i786 + br i1 %cmp.i.i.i787, label %if.then.i.i789, label %if.else.i.i793 + +if.then.i.i789: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit754 + %arraydecay.i.i.i788 = bitcast %union.anon* %199 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i788, i8* %201, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit796 + +if.else.i.i793: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit754 + %_M_p.i21.i.i790 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_5_b_path, i64 0, i32 0, i32 0 + store i8* %201, i8** %_M_p.i21.i.i790, align 8, !tbaa !62, !alias.scope !113 + %_M_allocated_capacity.i.i791 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i784, i64 0, i32 2, i32 0 + %203 = load i64, i64* %_M_allocated_capacity.i.i791, align 8, !tbaa !15 + %_M_allocated_capacity.i.i.i792 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_5_b_path, i64 0, i32 2, i32 0 + store i64 %203, i64* %_M_allocated_capacity.i.i.i792, align 8, !tbaa !15, !alias.scope !113 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit796 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit796: ; preds = %if.then.i.i789, %if.else.i.i793 + %_M_string_length.i20.i.i794 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i784, i64 0, i32 1 + %204 = load i64, i64* %_M_string_length.i20.i.i794, align 8, !tbaa !59 + %_M_string_length.i.i2.i795 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_5_b_path, i64 0, i32 1 + store i64 %204, i64* %_M_string_length.i.i2.i795, align 8, !tbaa !59, !alias.scope !113 + %205 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i784 to %union.anon** + store %union.anon* %202, %union.anon** %205, align 8, !tbaa !62 + store i64 0, i64* %_M_string_length.i20.i.i794, align 8, !tbaa !59 + store i8 0, i8* %arraydecay.i.i.i.i786, align 1, !tbaa !42 + %_M_p.i.i.i.i797 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp40, i64 0, i32 0, i32 0 + %206 = load i8*, i8** %_M_p.i.i.i.i797, align 8, !tbaa !62 + %cmp.i.i.i799 = icmp eq i8* %206, %195 + br i1 %cmp.i.i.i799, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit801, label %if.then.i.i800 + +if.then.i.i800: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit796 + call void @_ZdlPv(i8* %206) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit801 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit801: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit796, %if.then.i.i800 + call void @llvm.lifetime.end(i64 32, i8* nonnull %192) #2 + %_M_p.i.i802 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_5_b_path, i64 0, i32 0, i32 0 + %207 = load i8*, i8** %_M_p.i.i802, align 8, !tbaa !62 + %call43 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %207, i32 0, i64 1, i64 256, i64 1, i64 1) %208 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_6_w_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %208) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %208) #2 %209 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp44 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %209) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %209) #2 %210 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp44, i64 0, i32 2 %211 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp44 to %union.anon** - store %union.anon* %210, %union.anon** %211, align 8, !tbaa !52 + store %union.anon* %210, %union.anon** %211, align 8, !tbaa !58 %212 = bitcast %union.anon* %210 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %212, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.36, i64 0, i64 0), i64 14, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i818 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp44, i64 0, i32 1 - store i64 14, i64* %_M_string_length.i.i.i.i.i.i818, align 8, !tbaa !53 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %212, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.54, i64 0, i64 0), i64 14, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i826 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp44, i64 0, i32 1 + store i64 14, i64* %_M_string_length.i.i.i.i.i.i826, align 8, !tbaa !59 %213 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp44, i64 0, i32 2, i32 1, i64 6 - store i8 0, i8* %213, align 2, !tbaa !36 - %214 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !53, !noalias !98 - %215 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !56, !noalias !98 - %call3.i.i.i823 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp44, i64 0, i64 0, i8* %215, i64 %214) #7, !noalias !98 + store i8 0, i8* %213, align 2, !tbaa !42 + %214 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !59, !noalias !116 + %215 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !62, !noalias !116 + %call3.i.i.i831 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp44, i64 0, i64 0, i8* %215, i64 %214) #2, !noalias !116 %216 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_6_w_path, i64 0, i32 2 %217 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_6_w_path to %union.anon** - store %union.anon* %216, %union.anon** %217, align 8, !tbaa !52, !alias.scope !98 - %_M_p.i.i23.i.i824 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i823, i64 0, i32 0, i32 0 - %218 = load i8*, i8** %_M_p.i.i23.i.i824, align 8, !tbaa !56 - %219 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i823, i64 0, i32 2 - %arraydecay.i.i.i.i825 = bitcast %union.anon* %219 to i8* - %cmp.i.i.i826 = icmp eq i8* %218, %arraydecay.i.i.i.i825 - br i1 %cmp.i.i.i826, label %if.then.i.i828, label %if.else.i.i832 - -if.then.i.i828: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit793 - %arraydecay.i.i.i827 = bitcast %union.anon* %216 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i827, i8* %218, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit835 - -if.else.i.i832: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit793 - %_M_p.i21.i.i829 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_6_w_path, i64 0, i32 0, i32 0 - store i8* %218, i8** %_M_p.i21.i.i829, align 8, !tbaa !56, !alias.scope !98 - %_M_allocated_capacity.i.i830 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i823, i64 0, i32 2, i32 0 - %220 = load i64, i64* %_M_allocated_capacity.i.i830, align 8, !tbaa !12 - %_M_allocated_capacity.i.i.i831 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_6_w_path, i64 0, i32 2, i32 0 - store i64 %220, i64* %_M_allocated_capacity.i.i.i831, align 8, !tbaa !12, !alias.scope !98 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit835 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit835: ; preds = %if.then.i.i828, %if.else.i.i832 - %_M_string_length.i20.i.i833 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i823, i64 0, i32 1 - %221 = load i64, i64* %_M_string_length.i20.i.i833, align 8, !tbaa !53 - %_M_string_length.i.i2.i834 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_6_w_path, i64 0, i32 1 - store i64 %221, i64* %_M_string_length.i.i2.i834, align 8, !tbaa !53, !alias.scope !98 - %222 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i823 to %union.anon** - store %union.anon* %219, %union.anon** %222, align 8, !tbaa !56 - store i64 0, i64* %_M_string_length.i20.i.i833, align 8, !tbaa !53 - store i8 0, i8* %arraydecay.i.i.i.i825, align 1, !tbaa !36 - %_M_p.i.i.i.i836 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp44, i64 0, i32 0, i32 0 - %223 = load i8*, i8** %_M_p.i.i.i.i836, align 8, !tbaa !56 - %cmp.i.i.i838 = icmp eq i8* %223, %212 - br i1 %cmp.i.i.i838, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit840, label %if.then.i.i839 - -if.then.i.i839: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit835 - call void @_ZdlPv(i8* %223) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit840 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit840: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit835, %if.then.i.i839 - call void @llvm.lifetime.end(i64 32, i8* nonnull %209) #7 - %_M_p.i.i841 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_6_w_path, i64 0, i32 0, i32 0 - %224 = load i8*, i8** %_M_p.i.i841, align 8, !tbaa !56 - %call47 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %224, i32 0, i32 256, i32 256, i32 3, i32 3) + store %union.anon* %216, %union.anon** %217, align 8, !tbaa !58, !alias.scope !116 + %_M_p.i.i23.i.i832 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i831, i64 0, i32 0, i32 0 + %218 = load i8*, i8** %_M_p.i.i23.i.i832, align 8, !tbaa !62 + %219 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i831, i64 0, i32 2 + %arraydecay.i.i.i.i833 = bitcast %union.anon* %219 to i8* + %cmp.i.i.i834 = icmp eq i8* %218, %arraydecay.i.i.i.i833 + br i1 %cmp.i.i.i834, label %if.then.i.i836, label %if.else.i.i840 + +if.then.i.i836: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit801 + %arraydecay.i.i.i835 = bitcast %union.anon* %216 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i835, i8* %218, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit843 + +if.else.i.i840: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit801 + %_M_p.i21.i.i837 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_6_w_path, i64 0, i32 0, i32 0 + store i8* %218, i8** %_M_p.i21.i.i837, align 8, !tbaa !62, !alias.scope !116 + %_M_allocated_capacity.i.i838 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i831, i64 0, i32 2, i32 0 + %220 = load i64, i64* %_M_allocated_capacity.i.i838, align 8, !tbaa !15 + %_M_allocated_capacity.i.i.i839 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_6_w_path, i64 0, i32 2, i32 0 + store i64 %220, i64* %_M_allocated_capacity.i.i.i839, align 8, !tbaa !15, !alias.scope !116 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit843 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit843: ; preds = %if.then.i.i836, %if.else.i.i840 + %_M_string_length.i20.i.i841 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i831, i64 0, i32 1 + %221 = load i64, i64* %_M_string_length.i20.i.i841, align 8, !tbaa !59 + %_M_string_length.i.i2.i842 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_6_w_path, i64 0, i32 1 + store i64 %221, i64* %_M_string_length.i.i2.i842, align 8, !tbaa !59, !alias.scope !116 + %222 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i831 to %union.anon** + store %union.anon* %219, %union.anon** %222, align 8, !tbaa !62 + store i64 0, i64* %_M_string_length.i20.i.i841, align 8, !tbaa !59 + store i8 0, i8* %arraydecay.i.i.i.i833, align 1, !tbaa !42 + %_M_p.i.i.i.i844 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp44, i64 0, i32 0, i32 0 + %223 = load i8*, i8** %_M_p.i.i.i.i844, align 8, !tbaa !62 + %cmp.i.i.i846 = icmp eq i8* %223, %212 + br i1 %cmp.i.i.i846, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit848, label %if.then.i.i847 + +if.then.i.i847: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit843 + call void @_ZdlPv(i8* %223) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit848 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit848: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit843, %if.then.i.i847 + call void @llvm.lifetime.end(i64 32, i8* nonnull %209) #2 + %_M_p.i.i849 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_6_w_path, i64 0, i32 0, i32 0 + %224 = load i8*, i8** %_M_p.i.i849, align 8, !tbaa !62 + %call47 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %224, i32 0, i64 256, i64 256, i64 3, i64 3) %225 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_6_b_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %225) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %225) #2 %226 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp48 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %226) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %226) #2 %227 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp48, i64 0, i32 2 %228 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp48 to %union.anon** - store %union.anon* %227, %union.anon** %228, align 8, !tbaa !52 + store %union.anon* %227, %union.anon** %228, align 8, !tbaa !58 %229 = bitcast %union.anon* %227 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %229, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.37, i64 0, i64 0), i64 14, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i865 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp48, i64 0, i32 1 - store i64 14, i64* %_M_string_length.i.i.i.i.i.i865, align 8, !tbaa !53 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %229, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.55, i64 0, i64 0), i64 14, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i873 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp48, i64 0, i32 1 + store i64 14, i64* %_M_string_length.i.i.i.i.i.i873, align 8, !tbaa !59 %230 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp48, i64 0, i32 2, i32 1, i64 6 - store i8 0, i8* %230, align 2, !tbaa !36 - %231 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !53, !noalias !101 - %232 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !56, !noalias !101 - %call3.i.i.i870 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp48, i64 0, i64 0, i8* %232, i64 %231) #7, !noalias !101 + store i8 0, i8* %230, align 2, !tbaa !42 + %231 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !59, !noalias !119 + %232 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !62, !noalias !119 + %call3.i.i.i878 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp48, i64 0, i64 0, i8* %232, i64 %231) #2, !noalias !119 %233 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_6_b_path, i64 0, i32 2 %234 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_6_b_path to %union.anon** - store %union.anon* %233, %union.anon** %234, align 8, !tbaa !52, !alias.scope !101 - %_M_p.i.i23.i.i871 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i870, i64 0, i32 0, i32 0 - %235 = load i8*, i8** %_M_p.i.i23.i.i871, align 8, !tbaa !56 - %236 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i870, i64 0, i32 2 - %arraydecay.i.i.i.i872 = bitcast %union.anon* %236 to i8* - %cmp.i.i.i873 = icmp eq i8* %235, %arraydecay.i.i.i.i872 - br i1 %cmp.i.i.i873, label %if.then.i.i875, label %if.else.i.i879 - -if.then.i.i875: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit840 - %arraydecay.i.i.i874 = bitcast %union.anon* %233 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i874, i8* %235, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit882 - -if.else.i.i879: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit840 - %_M_p.i21.i.i876 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_6_b_path, i64 0, i32 0, i32 0 - store i8* %235, i8** %_M_p.i21.i.i876, align 8, !tbaa !56, !alias.scope !101 - %_M_allocated_capacity.i.i877 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i870, i64 0, i32 2, i32 0 - %237 = load i64, i64* %_M_allocated_capacity.i.i877, align 8, !tbaa !12 - %_M_allocated_capacity.i.i.i878 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_6_b_path, i64 0, i32 2, i32 0 - store i64 %237, i64* %_M_allocated_capacity.i.i.i878, align 8, !tbaa !12, !alias.scope !101 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit882 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit882: ; preds = %if.then.i.i875, %if.else.i.i879 - %_M_string_length.i20.i.i880 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i870, i64 0, i32 1 - %238 = load i64, i64* %_M_string_length.i20.i.i880, align 8, !tbaa !53 - %_M_string_length.i.i2.i881 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_6_b_path, i64 0, i32 1 - store i64 %238, i64* %_M_string_length.i.i2.i881, align 8, !tbaa !53, !alias.scope !101 - %239 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i870 to %union.anon** - store %union.anon* %236, %union.anon** %239, align 8, !tbaa !56 - store i64 0, i64* %_M_string_length.i20.i.i880, align 8, !tbaa !53 - store i8 0, i8* %arraydecay.i.i.i.i872, align 1, !tbaa !36 - %_M_p.i.i.i.i883 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp48, i64 0, i32 0, i32 0 - %240 = load i8*, i8** %_M_p.i.i.i.i883, align 8, !tbaa !56 - %cmp.i.i.i885 = icmp eq i8* %240, %229 - br i1 %cmp.i.i.i885, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit887, label %if.then.i.i886 - -if.then.i.i886: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit882 - call void @_ZdlPv(i8* %240) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit887 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit887: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit882, %if.then.i.i886 - call void @llvm.lifetime.end(i64 32, i8* nonnull %226) #7 - %_M_p.i.i888 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_6_b_path, i64 0, i32 0, i32 0 - %241 = load i8*, i8** %_M_p.i.i888, align 8, !tbaa !56 - %call51 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %241, i32 0, i32 1, i32 256, i32 1, i32 1) + store %union.anon* %233, %union.anon** %234, align 8, !tbaa !58, !alias.scope !119 + %_M_p.i.i23.i.i879 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i878, i64 0, i32 0, i32 0 + %235 = load i8*, i8** %_M_p.i.i23.i.i879, align 8, !tbaa !62 + %236 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i878, i64 0, i32 2 + %arraydecay.i.i.i.i880 = bitcast %union.anon* %236 to i8* + %cmp.i.i.i881 = icmp eq i8* %235, %arraydecay.i.i.i.i880 + br i1 %cmp.i.i.i881, label %if.then.i.i883, label %if.else.i.i887 + +if.then.i.i883: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit848 + %arraydecay.i.i.i882 = bitcast %union.anon* %233 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i882, i8* %235, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit890 + +if.else.i.i887: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit848 + %_M_p.i21.i.i884 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_6_b_path, i64 0, i32 0, i32 0 + store i8* %235, i8** %_M_p.i21.i.i884, align 8, !tbaa !62, !alias.scope !119 + %_M_allocated_capacity.i.i885 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i878, i64 0, i32 2, i32 0 + %237 = load i64, i64* %_M_allocated_capacity.i.i885, align 8, !tbaa !15 + %_M_allocated_capacity.i.i.i886 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_6_b_path, i64 0, i32 2, i32 0 + store i64 %237, i64* %_M_allocated_capacity.i.i.i886, align 8, !tbaa !15, !alias.scope !119 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit890 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit890: ; preds = %if.then.i.i883, %if.else.i.i887 + %_M_string_length.i20.i.i888 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i878, i64 0, i32 1 + %238 = load i64, i64* %_M_string_length.i20.i.i888, align 8, !tbaa !59 + %_M_string_length.i.i2.i889 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_6_b_path, i64 0, i32 1 + store i64 %238, i64* %_M_string_length.i.i2.i889, align 8, !tbaa !59, !alias.scope !119 + %239 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i878 to %union.anon** + store %union.anon* %236, %union.anon** %239, align 8, !tbaa !62 + store i64 0, i64* %_M_string_length.i20.i.i888, align 8, !tbaa !59 + store i8 0, i8* %arraydecay.i.i.i.i880, align 1, !tbaa !42 + %_M_p.i.i.i.i891 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp48, i64 0, i32 0, i32 0 + %240 = load i8*, i8** %_M_p.i.i.i.i891, align 8, !tbaa !62 + %cmp.i.i.i893 = icmp eq i8* %240, %229 + br i1 %cmp.i.i.i893, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit895, label %if.then.i.i894 + +if.then.i.i894: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit890 + call void @_ZdlPv(i8* %240) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit895 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit895: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit890, %if.then.i.i894 + call void @llvm.lifetime.end(i64 32, i8* nonnull %226) #2 + %_M_p.i.i896 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_6_b_path, i64 0, i32 0, i32 0 + %241 = load i8*, i8** %_M_p.i.i896, align 8, !tbaa !62 + %call51 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %241, i32 0, i64 1, i64 256, i64 1, i64 1) %242 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_7_w_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %242) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %242) #2 %243 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp52 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %243) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %243) #2 %244 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp52, i64 0, i32 2 %245 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp52 to %union.anon** - store %union.anon* %244, %union.anon** %245, align 8, !tbaa !52 + store %union.anon* %244, %union.anon** %245, align 8, !tbaa !58 %246 = bitcast %union.anon* %244 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %246, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.38, i64 0, i64 0), i64 14, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i912 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp52, i64 0, i32 1 - store i64 14, i64* %_M_string_length.i.i.i.i.i.i912, align 8, !tbaa !53 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %246, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.56, i64 0, i64 0), i64 14, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i920 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp52, i64 0, i32 1 + store i64 14, i64* %_M_string_length.i.i.i.i.i.i920, align 8, !tbaa !59 %247 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp52, i64 0, i32 2, i32 1, i64 6 - store i8 0, i8* %247, align 2, !tbaa !36 - %248 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !53, !noalias !104 - %249 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !56, !noalias !104 - %call3.i.i.i917 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp52, i64 0, i64 0, i8* %249, i64 %248) #7, !noalias !104 + store i8 0, i8* %247, align 2, !tbaa !42 + %248 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !59, !noalias !122 + %249 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !62, !noalias !122 + %call3.i.i.i925 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp52, i64 0, i64 0, i8* %249, i64 %248) #2, !noalias !122 %250 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_7_w_path, i64 0, i32 2 %251 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_7_w_path to %union.anon** - store %union.anon* %250, %union.anon** %251, align 8, !tbaa !52, !alias.scope !104 - %_M_p.i.i23.i.i918 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i917, i64 0, i32 0, i32 0 - %252 = load i8*, i8** %_M_p.i.i23.i.i918, align 8, !tbaa !56 - %253 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i917, i64 0, i32 2 - %arraydecay.i.i.i.i919 = bitcast %union.anon* %253 to i8* - %cmp.i.i.i920 = icmp eq i8* %252, %arraydecay.i.i.i.i919 - br i1 %cmp.i.i.i920, label %if.then.i.i922, label %if.else.i.i926 - -if.then.i.i922: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit887 - %arraydecay.i.i.i921 = bitcast %union.anon* %250 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i921, i8* %252, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit929 - -if.else.i.i926: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit887 - %_M_p.i21.i.i923 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_7_w_path, i64 0, i32 0, i32 0 - store i8* %252, i8** %_M_p.i21.i.i923, align 8, !tbaa !56, !alias.scope !104 - %_M_allocated_capacity.i.i924 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i917, i64 0, i32 2, i32 0 - %254 = load i64, i64* %_M_allocated_capacity.i.i924, align 8, !tbaa !12 - %_M_allocated_capacity.i.i.i925 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_7_w_path, i64 0, i32 2, i32 0 - store i64 %254, i64* %_M_allocated_capacity.i.i.i925, align 8, !tbaa !12, !alias.scope !104 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit929 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit929: ; preds = %if.then.i.i922, %if.else.i.i926 - %_M_string_length.i20.i.i927 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i917, i64 0, i32 1 - %255 = load i64, i64* %_M_string_length.i20.i.i927, align 8, !tbaa !53 - %_M_string_length.i.i2.i928 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_7_w_path, i64 0, i32 1 - store i64 %255, i64* %_M_string_length.i.i2.i928, align 8, !tbaa !53, !alias.scope !104 - %256 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i917 to %union.anon** - store %union.anon* %253, %union.anon** %256, align 8, !tbaa !56 - store i64 0, i64* %_M_string_length.i20.i.i927, align 8, !tbaa !53 - store i8 0, i8* %arraydecay.i.i.i.i919, align 1, !tbaa !36 - %_M_p.i.i.i.i930 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp52, i64 0, i32 0, i32 0 - %257 = load i8*, i8** %_M_p.i.i.i.i930, align 8, !tbaa !56 - %cmp.i.i.i932 = icmp eq i8* %257, %246 - br i1 %cmp.i.i.i932, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit934, label %if.then.i.i933 - -if.then.i.i933: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit929 - call void @_ZdlPv(i8* %257) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit934 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit934: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit929, %if.then.i.i933 - call void @llvm.lifetime.end(i64 32, i8* nonnull %243) #7 - %_M_p.i.i935 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_7_w_path, i64 0, i32 0, i32 0 - %258 = load i8*, i8** %_M_p.i.i935, align 8, !tbaa !56 - %call55 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %258, i32 0, i32 256, i32 256, i32 3, i32 3) + store %union.anon* %250, %union.anon** %251, align 8, !tbaa !58, !alias.scope !122 + %_M_p.i.i23.i.i926 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i925, i64 0, i32 0, i32 0 + %252 = load i8*, i8** %_M_p.i.i23.i.i926, align 8, !tbaa !62 + %253 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i925, i64 0, i32 2 + %arraydecay.i.i.i.i927 = bitcast %union.anon* %253 to i8* + %cmp.i.i.i928 = icmp eq i8* %252, %arraydecay.i.i.i.i927 + br i1 %cmp.i.i.i928, label %if.then.i.i930, label %if.else.i.i934 + +if.then.i.i930: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit895 + %arraydecay.i.i.i929 = bitcast %union.anon* %250 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i929, i8* %252, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit937 + +if.else.i.i934: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit895 + %_M_p.i21.i.i931 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_7_w_path, i64 0, i32 0, i32 0 + store i8* %252, i8** %_M_p.i21.i.i931, align 8, !tbaa !62, !alias.scope !122 + %_M_allocated_capacity.i.i932 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i925, i64 0, i32 2, i32 0 + %254 = load i64, i64* %_M_allocated_capacity.i.i932, align 8, !tbaa !15 + %_M_allocated_capacity.i.i.i933 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_7_w_path, i64 0, i32 2, i32 0 + store i64 %254, i64* %_M_allocated_capacity.i.i.i933, align 8, !tbaa !15, !alias.scope !122 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit937 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit937: ; preds = %if.then.i.i930, %if.else.i.i934 + %_M_string_length.i20.i.i935 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i925, i64 0, i32 1 + %255 = load i64, i64* %_M_string_length.i20.i.i935, align 8, !tbaa !59 + %_M_string_length.i.i2.i936 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_7_w_path, i64 0, i32 1 + store i64 %255, i64* %_M_string_length.i.i2.i936, align 8, !tbaa !59, !alias.scope !122 + %256 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i925 to %union.anon** + store %union.anon* %253, %union.anon** %256, align 8, !tbaa !62 + store i64 0, i64* %_M_string_length.i20.i.i935, align 8, !tbaa !59 + store i8 0, i8* %arraydecay.i.i.i.i927, align 1, !tbaa !42 + %_M_p.i.i.i.i938 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp52, i64 0, i32 0, i32 0 + %257 = load i8*, i8** %_M_p.i.i.i.i938, align 8, !tbaa !62 + %cmp.i.i.i940 = icmp eq i8* %257, %246 + br i1 %cmp.i.i.i940, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit942, label %if.then.i.i941 + +if.then.i.i941: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit937 + call void @_ZdlPv(i8* %257) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit942 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit942: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit937, %if.then.i.i941 + call void @llvm.lifetime.end(i64 32, i8* nonnull %243) #2 + %_M_p.i.i943 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_7_w_path, i64 0, i32 0, i32 0 + %258 = load i8*, i8** %_M_p.i.i943, align 8, !tbaa !62 + %call55 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %258, i32 0, i64 256, i64 256, i64 3, i64 3) %259 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_7_b_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %259) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %259) #2 %260 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp56 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %260) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %260) #2 %261 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp56, i64 0, i32 2 %262 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp56 to %union.anon** - store %union.anon* %261, %union.anon** %262, align 8, !tbaa !52 + store %union.anon* %261, %union.anon** %262, align 8, !tbaa !58 %263 = bitcast %union.anon* %261 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %263, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.39, i64 0, i64 0), i64 14, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i959 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp56, i64 0, i32 1 - store i64 14, i64* %_M_string_length.i.i.i.i.i.i959, align 8, !tbaa !53 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %263, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.57, i64 0, i64 0), i64 14, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i967 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp56, i64 0, i32 1 + store i64 14, i64* %_M_string_length.i.i.i.i.i.i967, align 8, !tbaa !59 %264 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp56, i64 0, i32 2, i32 1, i64 6 - store i8 0, i8* %264, align 2, !tbaa !36 - %265 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !53, !noalias !107 - %266 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !56, !noalias !107 - %call3.i.i.i964 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp56, i64 0, i64 0, i8* %266, i64 %265) #7, !noalias !107 + store i8 0, i8* %264, align 2, !tbaa !42 + %265 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !59, !noalias !125 + %266 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !62, !noalias !125 + %call3.i.i.i972 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp56, i64 0, i64 0, i8* %266, i64 %265) #2, !noalias !125 %267 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_7_b_path, i64 0, i32 2 %268 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_7_b_path to %union.anon** - store %union.anon* %267, %union.anon** %268, align 8, !tbaa !52, !alias.scope !107 - %_M_p.i.i23.i.i965 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i964, i64 0, i32 0, i32 0 - %269 = load i8*, i8** %_M_p.i.i23.i.i965, align 8, !tbaa !56 - %270 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i964, i64 0, i32 2 - %arraydecay.i.i.i.i966 = bitcast %union.anon* %270 to i8* - %cmp.i.i.i967 = icmp eq i8* %269, %arraydecay.i.i.i.i966 - br i1 %cmp.i.i.i967, label %if.then.i.i969, label %if.else.i.i973 - -if.then.i.i969: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit934 - %arraydecay.i.i.i968 = bitcast %union.anon* %267 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i968, i8* %269, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit976 - -if.else.i.i973: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit934 - %_M_p.i21.i.i970 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_7_b_path, i64 0, i32 0, i32 0 - store i8* %269, i8** %_M_p.i21.i.i970, align 8, !tbaa !56, !alias.scope !107 - %_M_allocated_capacity.i.i971 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i964, i64 0, i32 2, i32 0 - %271 = load i64, i64* %_M_allocated_capacity.i.i971, align 8, !tbaa !12 - %_M_allocated_capacity.i.i.i972 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_7_b_path, i64 0, i32 2, i32 0 - store i64 %271, i64* %_M_allocated_capacity.i.i.i972, align 8, !tbaa !12, !alias.scope !107 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit976 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit976: ; preds = %if.then.i.i969, %if.else.i.i973 - %_M_string_length.i20.i.i974 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i964, i64 0, i32 1 - %272 = load i64, i64* %_M_string_length.i20.i.i974, align 8, !tbaa !53 - %_M_string_length.i.i2.i975 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_7_b_path, i64 0, i32 1 - store i64 %272, i64* %_M_string_length.i.i2.i975, align 8, !tbaa !53, !alias.scope !107 - %273 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i964 to %union.anon** - store %union.anon* %270, %union.anon** %273, align 8, !tbaa !56 - store i64 0, i64* %_M_string_length.i20.i.i974, align 8, !tbaa !53 - store i8 0, i8* %arraydecay.i.i.i.i966, align 1, !tbaa !36 - %_M_p.i.i.i.i977 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp56, i64 0, i32 0, i32 0 - %274 = load i8*, i8** %_M_p.i.i.i.i977, align 8, !tbaa !56 - %cmp.i.i.i979 = icmp eq i8* %274, %263 - br i1 %cmp.i.i.i979, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit981, label %if.then.i.i980 - -if.then.i.i980: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit976 - call void @_ZdlPv(i8* %274) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit981 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit981: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit976, %if.then.i.i980 - call void @llvm.lifetime.end(i64 32, i8* nonnull %260) #7 - %_M_p.i.i982 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_7_b_path, i64 0, i32 0, i32 0 - %275 = load i8*, i8** %_M_p.i.i982, align 8, !tbaa !56 - %call59 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %275, i32 0, i32 1, i32 256, i32 1, i32 1) + store %union.anon* %267, %union.anon** %268, align 8, !tbaa !58, !alias.scope !125 + %_M_p.i.i23.i.i973 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i972, i64 0, i32 0, i32 0 + %269 = load i8*, i8** %_M_p.i.i23.i.i973, align 8, !tbaa !62 + %270 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i972, i64 0, i32 2 + %arraydecay.i.i.i.i974 = bitcast %union.anon* %270 to i8* + %cmp.i.i.i975 = icmp eq i8* %269, %arraydecay.i.i.i.i974 + br i1 %cmp.i.i.i975, label %if.then.i.i977, label %if.else.i.i981 + +if.then.i.i977: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit942 + %arraydecay.i.i.i976 = bitcast %union.anon* %267 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i976, i8* %269, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit984 + +if.else.i.i981: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit942 + %_M_p.i21.i.i978 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_7_b_path, i64 0, i32 0, i32 0 + store i8* %269, i8** %_M_p.i21.i.i978, align 8, !tbaa !62, !alias.scope !125 + %_M_allocated_capacity.i.i979 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i972, i64 0, i32 2, i32 0 + %271 = load i64, i64* %_M_allocated_capacity.i.i979, align 8, !tbaa !15 + %_M_allocated_capacity.i.i.i980 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_7_b_path, i64 0, i32 2, i32 0 + store i64 %271, i64* %_M_allocated_capacity.i.i.i980, align 8, !tbaa !15, !alias.scope !125 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit984 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit984: ; preds = %if.then.i.i977, %if.else.i.i981 + %_M_string_length.i20.i.i982 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i972, i64 0, i32 1 + %272 = load i64, i64* %_M_string_length.i20.i.i982, align 8, !tbaa !59 + %_M_string_length.i.i2.i983 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_7_b_path, i64 0, i32 1 + store i64 %272, i64* %_M_string_length.i.i2.i983, align 8, !tbaa !59, !alias.scope !125 + %273 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i972 to %union.anon** + store %union.anon* %270, %union.anon** %273, align 8, !tbaa !62 + store i64 0, i64* %_M_string_length.i20.i.i982, align 8, !tbaa !59 + store i8 0, i8* %arraydecay.i.i.i.i974, align 1, !tbaa !42 + %_M_p.i.i.i.i985 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp56, i64 0, i32 0, i32 0 + %274 = load i8*, i8** %_M_p.i.i.i.i985, align 8, !tbaa !62 + %cmp.i.i.i987 = icmp eq i8* %274, %263 + br i1 %cmp.i.i.i987, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit989, label %if.then.i.i988 + +if.then.i.i988: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit984 + call void @_ZdlPv(i8* %274) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit989 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit989: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit984, %if.then.i.i988 + call void @llvm.lifetime.end(i64 32, i8* nonnull %260) #2 + %_M_p.i.i990 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_7_b_path, i64 0, i32 0, i32 0 + %275 = load i8*, i8** %_M_p.i.i990, align 8, !tbaa !62 + %call59 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %275, i32 0, i64 1, i64 256, i64 1, i64 1) %276 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_8_w_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %276) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %276) #2 %277 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp60 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %277) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %277) #2 %278 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp60, i64 0, i32 2 %279 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp60 to %union.anon** - store %union.anon* %278, %union.anon** %279, align 8, !tbaa !52 + store %union.anon* %278, %union.anon** %279, align 8, !tbaa !58 %280 = bitcast %union.anon* %278 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %280, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.40, i64 0, i64 0), i64 14, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i1006 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp60, i64 0, i32 1 - store i64 14, i64* %_M_string_length.i.i.i.i.i.i1006, align 8, !tbaa !53 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %280, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.58, i64 0, i64 0), i64 14, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i1014 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp60, i64 0, i32 1 + store i64 14, i64* %_M_string_length.i.i.i.i.i.i1014, align 8, !tbaa !59 %281 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp60, i64 0, i32 2, i32 1, i64 6 - store i8 0, i8* %281, align 2, !tbaa !36 - %282 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !53, !noalias !110 - %283 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !56, !noalias !110 - %call3.i.i.i1011 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp60, i64 0, i64 0, i8* %283, i64 %282) #7, !noalias !110 + store i8 0, i8* %281, align 2, !tbaa !42 + %282 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !59, !noalias !128 + %283 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !62, !noalias !128 + %call3.i.i.i1019 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp60, i64 0, i64 0, i8* %283, i64 %282) #2, !noalias !128 %284 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_8_w_path, i64 0, i32 2 %285 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_8_w_path to %union.anon** - store %union.anon* %284, %union.anon** %285, align 8, !tbaa !52, !alias.scope !110 - %_M_p.i.i23.i.i1012 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1011, i64 0, i32 0, i32 0 - %286 = load i8*, i8** %_M_p.i.i23.i.i1012, align 8, !tbaa !56 - %287 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1011, i64 0, i32 2 - %arraydecay.i.i.i.i1013 = bitcast %union.anon* %287 to i8* - %cmp.i.i.i1014 = icmp eq i8* %286, %arraydecay.i.i.i.i1013 - br i1 %cmp.i.i.i1014, label %if.then.i.i1016, label %if.else.i.i1020 - -if.then.i.i1016: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit981 - %arraydecay.i.i.i1015 = bitcast %union.anon* %284 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1015, i8* %286, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1023 - -if.else.i.i1020: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit981 - %_M_p.i21.i.i1017 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_8_w_path, i64 0, i32 0, i32 0 - store i8* %286, i8** %_M_p.i21.i.i1017, align 8, !tbaa !56, !alias.scope !110 - %_M_allocated_capacity.i.i1018 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1011, i64 0, i32 2, i32 0 - %288 = load i64, i64* %_M_allocated_capacity.i.i1018, align 8, !tbaa !12 - %_M_allocated_capacity.i.i.i1019 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_8_w_path, i64 0, i32 2, i32 0 - store i64 %288, i64* %_M_allocated_capacity.i.i.i1019, align 8, !tbaa !12, !alias.scope !110 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1023 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1023: ; preds = %if.then.i.i1016, %if.else.i.i1020 - %_M_string_length.i20.i.i1021 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1011, i64 0, i32 1 - %289 = load i64, i64* %_M_string_length.i20.i.i1021, align 8, !tbaa !53 - %_M_string_length.i.i2.i1022 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_8_w_path, i64 0, i32 1 - store i64 %289, i64* %_M_string_length.i.i2.i1022, align 8, !tbaa !53, !alias.scope !110 - %290 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1011 to %union.anon** - store %union.anon* %287, %union.anon** %290, align 8, !tbaa !56 - store i64 0, i64* %_M_string_length.i20.i.i1021, align 8, !tbaa !53 - store i8 0, i8* %arraydecay.i.i.i.i1013, align 1, !tbaa !36 - %_M_p.i.i.i.i1024 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp60, i64 0, i32 0, i32 0 - %291 = load i8*, i8** %_M_p.i.i.i.i1024, align 8, !tbaa !56 - %cmp.i.i.i1026 = icmp eq i8* %291, %280 - br i1 %cmp.i.i.i1026, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1028, label %if.then.i.i1027 - -if.then.i.i1027: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1023 - call void @_ZdlPv(i8* %291) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1028 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1028: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1023, %if.then.i.i1027 - call void @llvm.lifetime.end(i64 32, i8* nonnull %277) #7 - %_M_p.i.i1029 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_8_w_path, i64 0, i32 0, i32 0 - %292 = load i8*, i8** %_M_p.i.i1029, align 8, !tbaa !56 - %call63 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %292, i32 0, i32 512, i32 256, i32 3, i32 3) + store %union.anon* %284, %union.anon** %285, align 8, !tbaa !58, !alias.scope !128 + %_M_p.i.i23.i.i1020 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1019, i64 0, i32 0, i32 0 + %286 = load i8*, i8** %_M_p.i.i23.i.i1020, align 8, !tbaa !62 + %287 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1019, i64 0, i32 2 + %arraydecay.i.i.i.i1021 = bitcast %union.anon* %287 to i8* + %cmp.i.i.i1022 = icmp eq i8* %286, %arraydecay.i.i.i.i1021 + br i1 %cmp.i.i.i1022, label %if.then.i.i1024, label %if.else.i.i1028 + +if.then.i.i1024: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit989 + %arraydecay.i.i.i1023 = bitcast %union.anon* %284 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1023, i8* %286, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1031 + +if.else.i.i1028: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit989 + %_M_p.i21.i.i1025 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_8_w_path, i64 0, i32 0, i32 0 + store i8* %286, i8** %_M_p.i21.i.i1025, align 8, !tbaa !62, !alias.scope !128 + %_M_allocated_capacity.i.i1026 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1019, i64 0, i32 2, i32 0 + %288 = load i64, i64* %_M_allocated_capacity.i.i1026, align 8, !tbaa !15 + %_M_allocated_capacity.i.i.i1027 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_8_w_path, i64 0, i32 2, i32 0 + store i64 %288, i64* %_M_allocated_capacity.i.i.i1027, align 8, !tbaa !15, !alias.scope !128 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1031 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1031: ; preds = %if.then.i.i1024, %if.else.i.i1028 + %_M_string_length.i20.i.i1029 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1019, i64 0, i32 1 + %289 = load i64, i64* %_M_string_length.i20.i.i1029, align 8, !tbaa !59 + %_M_string_length.i.i2.i1030 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_8_w_path, i64 0, i32 1 + store i64 %289, i64* %_M_string_length.i.i2.i1030, align 8, !tbaa !59, !alias.scope !128 + %290 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1019 to %union.anon** + store %union.anon* %287, %union.anon** %290, align 8, !tbaa !62 + store i64 0, i64* %_M_string_length.i20.i.i1029, align 8, !tbaa !59 + store i8 0, i8* %arraydecay.i.i.i.i1021, align 1, !tbaa !42 + %_M_p.i.i.i.i1032 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp60, i64 0, i32 0, i32 0 + %291 = load i8*, i8** %_M_p.i.i.i.i1032, align 8, !tbaa !62 + %cmp.i.i.i1034 = icmp eq i8* %291, %280 + br i1 %cmp.i.i.i1034, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1036, label %if.then.i.i1035 + +if.then.i.i1035: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1031 + call void @_ZdlPv(i8* %291) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1036 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1036: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1031, %if.then.i.i1035 + call void @llvm.lifetime.end(i64 32, i8* nonnull %277) #2 + %_M_p.i.i1037 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_8_w_path, i64 0, i32 0, i32 0 + %292 = load i8*, i8** %_M_p.i.i1037, align 8, !tbaa !62 + %call63 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %292, i32 0, i64 512, i64 256, i64 3, i64 3) %293 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_8_b_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %293) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %293) #2 %294 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp64 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %294) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %294) #2 %295 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp64, i64 0, i32 2 %296 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp64 to %union.anon** - store %union.anon* %295, %union.anon** %296, align 8, !tbaa !52 + store %union.anon* %295, %union.anon** %296, align 8, !tbaa !58 %297 = bitcast %union.anon* %295 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %297, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.41, i64 0, i64 0), i64 14, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i1048 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp64, i64 0, i32 1 - store i64 14, i64* %_M_string_length.i.i.i.i.i.i1048, align 8, !tbaa !53 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %297, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.59, i64 0, i64 0), i64 14, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i1056 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp64, i64 0, i32 1 + store i64 14, i64* %_M_string_length.i.i.i.i.i.i1056, align 8, !tbaa !59 %298 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp64, i64 0, i32 2, i32 1, i64 6 - store i8 0, i8* %298, align 2, !tbaa !36 - %299 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !53, !noalias !113 - %300 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !56, !noalias !113 - %call3.i.i.i1053 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp64, i64 0, i64 0, i8* %300, i64 %299) #7, !noalias !113 + store i8 0, i8* %298, align 2, !tbaa !42 + %299 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !59, !noalias !131 + %300 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !62, !noalias !131 + %call3.i.i.i1061 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp64, i64 0, i64 0, i8* %300, i64 %299) #2, !noalias !131 %301 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_8_b_path, i64 0, i32 2 %302 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_8_b_path to %union.anon** - store %union.anon* %301, %union.anon** %302, align 8, !tbaa !52, !alias.scope !113 - %_M_p.i.i23.i.i1054 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1053, i64 0, i32 0, i32 0 - %303 = load i8*, i8** %_M_p.i.i23.i.i1054, align 8, !tbaa !56 - %304 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1053, i64 0, i32 2 - %arraydecay.i.i.i.i1055 = bitcast %union.anon* %304 to i8* - %cmp.i.i.i1056 = icmp eq i8* %303, %arraydecay.i.i.i.i1055 - br i1 %cmp.i.i.i1056, label %if.then.i.i1058, label %if.else.i.i1062 - -if.then.i.i1058: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1028 - %arraydecay.i.i.i1057 = bitcast %union.anon* %301 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1057, i8* %303, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1065 - -if.else.i.i1062: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1028 - %_M_p.i21.i.i1059 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_8_b_path, i64 0, i32 0, i32 0 - store i8* %303, i8** %_M_p.i21.i.i1059, align 8, !tbaa !56, !alias.scope !113 - %_M_allocated_capacity.i.i1060 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1053, i64 0, i32 2, i32 0 - %305 = load i64, i64* %_M_allocated_capacity.i.i1060, align 8, !tbaa !12 - %_M_allocated_capacity.i.i.i1061 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_8_b_path, i64 0, i32 2, i32 0 - store i64 %305, i64* %_M_allocated_capacity.i.i.i1061, align 8, !tbaa !12, !alias.scope !113 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1065 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1065: ; preds = %if.then.i.i1058, %if.else.i.i1062 - %_M_string_length.i20.i.i1063 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1053, i64 0, i32 1 - %306 = load i64, i64* %_M_string_length.i20.i.i1063, align 8, !tbaa !53 - %_M_string_length.i.i2.i1064 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_8_b_path, i64 0, i32 1 - store i64 %306, i64* %_M_string_length.i.i2.i1064, align 8, !tbaa !53, !alias.scope !113 - %307 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1053 to %union.anon** - store %union.anon* %304, %union.anon** %307, align 8, !tbaa !56 - store i64 0, i64* %_M_string_length.i20.i.i1063, align 8, !tbaa !53 - store i8 0, i8* %arraydecay.i.i.i.i1055, align 1, !tbaa !36 - %_M_p.i.i.i.i1066 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp64, i64 0, i32 0, i32 0 - %308 = load i8*, i8** %_M_p.i.i.i.i1066, align 8, !tbaa !56 - %cmp.i.i.i1068 = icmp eq i8* %308, %297 - br i1 %cmp.i.i.i1068, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1070, label %if.then.i.i1069 - -if.then.i.i1069: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1065 - call void @_ZdlPv(i8* %308) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1070 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1070: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1065, %if.then.i.i1069 - call void @llvm.lifetime.end(i64 32, i8* nonnull %294) #7 - %_M_p.i.i1071 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_8_b_path, i64 0, i32 0, i32 0 - %309 = load i8*, i8** %_M_p.i.i1071, align 8, !tbaa !56 - %call67 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %309, i32 0, i32 1, i32 512, i32 1, i32 1) + store %union.anon* %301, %union.anon** %302, align 8, !tbaa !58, !alias.scope !131 + %_M_p.i.i23.i.i1062 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1061, i64 0, i32 0, i32 0 + %303 = load i8*, i8** %_M_p.i.i23.i.i1062, align 8, !tbaa !62 + %304 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1061, i64 0, i32 2 + %arraydecay.i.i.i.i1063 = bitcast %union.anon* %304 to i8* + %cmp.i.i.i1064 = icmp eq i8* %303, %arraydecay.i.i.i.i1063 + br i1 %cmp.i.i.i1064, label %if.then.i.i1066, label %if.else.i.i1070 + +if.then.i.i1066: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1036 + %arraydecay.i.i.i1065 = bitcast %union.anon* %301 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1065, i8* %303, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1073 + +if.else.i.i1070: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1036 + %_M_p.i21.i.i1067 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_8_b_path, i64 0, i32 0, i32 0 + store i8* %303, i8** %_M_p.i21.i.i1067, align 8, !tbaa !62, !alias.scope !131 + %_M_allocated_capacity.i.i1068 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1061, i64 0, i32 2, i32 0 + %305 = load i64, i64* %_M_allocated_capacity.i.i1068, align 8, !tbaa !15 + %_M_allocated_capacity.i.i.i1069 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_8_b_path, i64 0, i32 2, i32 0 + store i64 %305, i64* %_M_allocated_capacity.i.i.i1069, align 8, !tbaa !15, !alias.scope !131 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1073 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1073: ; preds = %if.then.i.i1066, %if.else.i.i1070 + %_M_string_length.i20.i.i1071 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1061, i64 0, i32 1 + %306 = load i64, i64* %_M_string_length.i20.i.i1071, align 8, !tbaa !59 + %_M_string_length.i.i2.i1072 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_8_b_path, i64 0, i32 1 + store i64 %306, i64* %_M_string_length.i.i2.i1072, align 8, !tbaa !59, !alias.scope !131 + %307 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1061 to %union.anon** + store %union.anon* %304, %union.anon** %307, align 8, !tbaa !62 + store i64 0, i64* %_M_string_length.i20.i.i1071, align 8, !tbaa !59 + store i8 0, i8* %arraydecay.i.i.i.i1063, align 1, !tbaa !42 + %_M_p.i.i.i.i1074 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp64, i64 0, i32 0, i32 0 + %308 = load i8*, i8** %_M_p.i.i.i.i1074, align 8, !tbaa !62 + %cmp.i.i.i1076 = icmp eq i8* %308, %297 + br i1 %cmp.i.i.i1076, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1078, label %if.then.i.i1077 + +if.then.i.i1077: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1073 + call void @_ZdlPv(i8* %308) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1078 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1078: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1073, %if.then.i.i1077 + call void @llvm.lifetime.end(i64 32, i8* nonnull %294) #2 + %_M_p.i.i1079 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_8_b_path, i64 0, i32 0, i32 0 + %309 = load i8*, i8** %_M_p.i.i1079, align 8, !tbaa !62 + %call67 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %309, i32 0, i64 1, i64 512, i64 1, i64 1) %310 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_9_w_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %310) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %310) #2 %311 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp68 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %311) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %311) #2 %312 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp68, i64 0, i32 2 %313 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp68 to %union.anon** - store %union.anon* %312, %union.anon** %313, align 8, !tbaa !52 + store %union.anon* %312, %union.anon** %313, align 8, !tbaa !58 %314 = bitcast %union.anon* %312 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %314, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.42, i64 0, i64 0), i64 14, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i1085 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp68, i64 0, i32 1 - store i64 14, i64* %_M_string_length.i.i.i.i.i.i1085, align 8, !tbaa !53 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %314, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.60, i64 0, i64 0), i64 14, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i1094 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp68, i64 0, i32 1 + store i64 14, i64* %_M_string_length.i.i.i.i.i.i1094, align 8, !tbaa !59 %315 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp68, i64 0, i32 2, i32 1, i64 6 - store i8 0, i8* %315, align 2, !tbaa !36 - %316 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !53, !noalias !116 - %317 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !56, !noalias !116 - %call3.i.i.i1090 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp68, i64 0, i64 0, i8* %317, i64 %316) #7, !noalias !116 + store i8 0, i8* %315, align 2, !tbaa !42 + %316 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !59, !noalias !134 + %317 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !62, !noalias !134 + %call3.i.i.i1099 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp68, i64 0, i64 0, i8* %317, i64 %316) #2, !noalias !134 %318 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_9_w_path, i64 0, i32 2 %319 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_9_w_path to %union.anon** - store %union.anon* %318, %union.anon** %319, align 8, !tbaa !52, !alias.scope !116 - %_M_p.i.i23.i.i1091 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1090, i64 0, i32 0, i32 0 - %320 = load i8*, i8** %_M_p.i.i23.i.i1091, align 8, !tbaa !56 - %321 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1090, i64 0, i32 2 - %arraydecay.i.i.i.i1092 = bitcast %union.anon* %321 to i8* - %cmp.i.i.i1093 = icmp eq i8* %320, %arraydecay.i.i.i.i1092 - br i1 %cmp.i.i.i1093, label %if.then.i.i1095, label %if.else.i.i1099 - -if.then.i.i1095: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1070 - %arraydecay.i.i.i1094 = bitcast %union.anon* %318 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1094, i8* %320, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1102 - -if.else.i.i1099: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1070 - %_M_p.i21.i.i1096 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_9_w_path, i64 0, i32 0, i32 0 - store i8* %320, i8** %_M_p.i21.i.i1096, align 8, !tbaa !56, !alias.scope !116 - %_M_allocated_capacity.i.i1097 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1090, i64 0, i32 2, i32 0 - %322 = load i64, i64* %_M_allocated_capacity.i.i1097, align 8, !tbaa !12 - %_M_allocated_capacity.i.i.i1098 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_9_w_path, i64 0, i32 2, i32 0 - store i64 %322, i64* %_M_allocated_capacity.i.i.i1098, align 8, !tbaa !12, !alias.scope !116 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1102 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1102: ; preds = %if.then.i.i1095, %if.else.i.i1099 - %_M_string_length.i20.i.i1100 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1090, i64 0, i32 1 - %323 = load i64, i64* %_M_string_length.i20.i.i1100, align 8, !tbaa !53 - %_M_string_length.i.i2.i1101 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_9_w_path, i64 0, i32 1 - store i64 %323, i64* %_M_string_length.i.i2.i1101, align 8, !tbaa !53, !alias.scope !116 - %324 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1090 to %union.anon** - store %union.anon* %321, %union.anon** %324, align 8, !tbaa !56 - store i64 0, i64* %_M_string_length.i20.i.i1100, align 8, !tbaa !53 - store i8 0, i8* %arraydecay.i.i.i.i1092, align 1, !tbaa !36 - %_M_p.i.i.i.i1103 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp68, i64 0, i32 0, i32 0 - %325 = load i8*, i8** %_M_p.i.i.i.i1103, align 8, !tbaa !56 - %cmp.i.i.i1105 = icmp eq i8* %325, %314 - br i1 %cmp.i.i.i1105, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1107, label %if.then.i.i1106 - -if.then.i.i1106: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1102 - call void @_ZdlPv(i8* %325) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1107 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1107: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1102, %if.then.i.i1106 - call void @llvm.lifetime.end(i64 32, i8* nonnull %311) #7 - %_M_p.i.i1108 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_9_w_path, i64 0, i32 0, i32 0 - %326 = load i8*, i8** %_M_p.i.i1108, align 8, !tbaa !56 - %call71 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %326, i32 0, i32 512, i32 512, i32 3, i32 3) + store %union.anon* %318, %union.anon** %319, align 8, !tbaa !58, !alias.scope !134 + %_M_p.i.i23.i.i1100 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1099, i64 0, i32 0, i32 0 + %320 = load i8*, i8** %_M_p.i.i23.i.i1100, align 8, !tbaa !62 + %321 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1099, i64 0, i32 2 + %arraydecay.i.i.i.i1101 = bitcast %union.anon* %321 to i8* + %cmp.i.i.i1102 = icmp eq i8* %320, %arraydecay.i.i.i.i1101 + br i1 %cmp.i.i.i1102, label %if.then.i.i1104, label %if.else.i.i1108 + +if.then.i.i1104: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1078 + %arraydecay.i.i.i1103 = bitcast %union.anon* %318 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1103, i8* %320, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1111 + +if.else.i.i1108: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1078 + %_M_p.i21.i.i1105 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_9_w_path, i64 0, i32 0, i32 0 + store i8* %320, i8** %_M_p.i21.i.i1105, align 8, !tbaa !62, !alias.scope !134 + %_M_allocated_capacity.i.i1106 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1099, i64 0, i32 2, i32 0 + %322 = load i64, i64* %_M_allocated_capacity.i.i1106, align 8, !tbaa !15 + %_M_allocated_capacity.i.i.i1107 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_9_w_path, i64 0, i32 2, i32 0 + store i64 %322, i64* %_M_allocated_capacity.i.i.i1107, align 8, !tbaa !15, !alias.scope !134 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1111 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1111: ; preds = %if.then.i.i1104, %if.else.i.i1108 + %_M_string_length.i20.i.i1109 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1099, i64 0, i32 1 + %323 = load i64, i64* %_M_string_length.i20.i.i1109, align 8, !tbaa !59 + %_M_string_length.i.i2.i1110 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_9_w_path, i64 0, i32 1 + store i64 %323, i64* %_M_string_length.i.i2.i1110, align 8, !tbaa !59, !alias.scope !134 + %324 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1099 to %union.anon** + store %union.anon* %321, %union.anon** %324, align 8, !tbaa !62 + store i64 0, i64* %_M_string_length.i20.i.i1109, align 8, !tbaa !59 + store i8 0, i8* %arraydecay.i.i.i.i1101, align 1, !tbaa !42 + %_M_p.i.i.i.i1112 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp68, i64 0, i32 0, i32 0 + %325 = load i8*, i8** %_M_p.i.i.i.i1112, align 8, !tbaa !62 + %cmp.i.i.i1114 = icmp eq i8* %325, %314 + br i1 %cmp.i.i.i1114, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1116, label %if.then.i.i1115 + +if.then.i.i1115: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1111 + call void @_ZdlPv(i8* %325) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1116 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1116: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1111, %if.then.i.i1115 + call void @llvm.lifetime.end(i64 32, i8* nonnull %311) #2 + %_M_p.i.i1117 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_9_w_path, i64 0, i32 0, i32 0 + %326 = load i8*, i8** %_M_p.i.i1117, align 8, !tbaa !62 + %call71 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %326, i32 0, i64 512, i64 512, i64 3, i64 3) %327 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_9_b_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %327) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %327) #2 %328 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp72 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %328) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %328) #2 %329 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp72, i64 0, i32 2 %330 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp72 to %union.anon** - store %union.anon* %329, %union.anon** %330, align 8, !tbaa !52 + store %union.anon* %329, %union.anon** %330, align 8, !tbaa !58 %331 = bitcast %union.anon* %329 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %331, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.43, i64 0, i64 0), i64 14, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i1122 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp72, i64 0, i32 1 - store i64 14, i64* %_M_string_length.i.i.i.i.i.i1122, align 8, !tbaa !53 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %331, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.61, i64 0, i64 0), i64 14, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i1131 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp72, i64 0, i32 1 + store i64 14, i64* %_M_string_length.i.i.i.i.i.i1131, align 8, !tbaa !59 %332 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp72, i64 0, i32 2, i32 1, i64 6 - store i8 0, i8* %332, align 2, !tbaa !36 - %333 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !53, !noalias !119 - %334 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !56, !noalias !119 - %call3.i.i.i1127 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp72, i64 0, i64 0, i8* %334, i64 %333) #7, !noalias !119 + store i8 0, i8* %332, align 2, !tbaa !42 + %333 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !59, !noalias !137 + %334 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !62, !noalias !137 + %call3.i.i.i1136 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp72, i64 0, i64 0, i8* %334, i64 %333) #2, !noalias !137 %335 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_9_b_path, i64 0, i32 2 %336 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_9_b_path to %union.anon** - store %union.anon* %335, %union.anon** %336, align 8, !tbaa !52, !alias.scope !119 - %_M_p.i.i23.i.i1128 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1127, i64 0, i32 0, i32 0 - %337 = load i8*, i8** %_M_p.i.i23.i.i1128, align 8, !tbaa !56 - %338 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1127, i64 0, i32 2 - %arraydecay.i.i.i.i1129 = bitcast %union.anon* %338 to i8* - %cmp.i.i.i1130 = icmp eq i8* %337, %arraydecay.i.i.i.i1129 - br i1 %cmp.i.i.i1130, label %if.then.i.i1132, label %if.else.i.i1136 - -if.then.i.i1132: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1107 - %arraydecay.i.i.i1131 = bitcast %union.anon* %335 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1131, i8* %337, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1139 - -if.else.i.i1136: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1107 - %_M_p.i21.i.i1133 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_9_b_path, i64 0, i32 0, i32 0 - store i8* %337, i8** %_M_p.i21.i.i1133, align 8, !tbaa !56, !alias.scope !119 - %_M_allocated_capacity.i.i1134 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1127, i64 0, i32 2, i32 0 - %339 = load i64, i64* %_M_allocated_capacity.i.i1134, align 8, !tbaa !12 - %_M_allocated_capacity.i.i.i1135 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_9_b_path, i64 0, i32 2, i32 0 - store i64 %339, i64* %_M_allocated_capacity.i.i.i1135, align 8, !tbaa !12, !alias.scope !119 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1139 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1139: ; preds = %if.then.i.i1132, %if.else.i.i1136 - %_M_string_length.i20.i.i1137 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1127, i64 0, i32 1 - %340 = load i64, i64* %_M_string_length.i20.i.i1137, align 8, !tbaa !53 - %_M_string_length.i.i2.i1138 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_9_b_path, i64 0, i32 1 - store i64 %340, i64* %_M_string_length.i.i2.i1138, align 8, !tbaa !53, !alias.scope !119 - %341 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1127 to %union.anon** - store %union.anon* %338, %union.anon** %341, align 8, !tbaa !56 - store i64 0, i64* %_M_string_length.i20.i.i1137, align 8, !tbaa !53 - store i8 0, i8* %arraydecay.i.i.i.i1129, align 1, !tbaa !36 - %_M_p.i.i.i.i1140 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp72, i64 0, i32 0, i32 0 - %342 = load i8*, i8** %_M_p.i.i.i.i1140, align 8, !tbaa !56 - %cmp.i.i.i1142 = icmp eq i8* %342, %331 - br i1 %cmp.i.i.i1142, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1144, label %if.then.i.i1143 - -if.then.i.i1143: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1139 - call void @_ZdlPv(i8* %342) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1144 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1144: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1139, %if.then.i.i1143 - call void @llvm.lifetime.end(i64 32, i8* nonnull %328) #7 - %_M_p.i.i1145 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_9_b_path, i64 0, i32 0, i32 0 - %343 = load i8*, i8** %_M_p.i.i1145, align 8, !tbaa !56 - %call75 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %343, i32 0, i32 1, i32 512, i32 1, i32 1) + store %union.anon* %335, %union.anon** %336, align 8, !tbaa !58, !alias.scope !137 + %_M_p.i.i23.i.i1137 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1136, i64 0, i32 0, i32 0 + %337 = load i8*, i8** %_M_p.i.i23.i.i1137, align 8, !tbaa !62 + %338 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1136, i64 0, i32 2 + %arraydecay.i.i.i.i1138 = bitcast %union.anon* %338 to i8* + %cmp.i.i.i1139 = icmp eq i8* %337, %arraydecay.i.i.i.i1138 + br i1 %cmp.i.i.i1139, label %if.then.i.i1141, label %if.else.i.i1145 + +if.then.i.i1141: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1116 + %arraydecay.i.i.i1140 = bitcast %union.anon* %335 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1140, i8* %337, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1148 + +if.else.i.i1145: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1116 + %_M_p.i21.i.i1142 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_9_b_path, i64 0, i32 0, i32 0 + store i8* %337, i8** %_M_p.i21.i.i1142, align 8, !tbaa !62, !alias.scope !137 + %_M_allocated_capacity.i.i1143 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1136, i64 0, i32 2, i32 0 + %339 = load i64, i64* %_M_allocated_capacity.i.i1143, align 8, !tbaa !15 + %_M_allocated_capacity.i.i.i1144 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_9_b_path, i64 0, i32 2, i32 0 + store i64 %339, i64* %_M_allocated_capacity.i.i.i1144, align 8, !tbaa !15, !alias.scope !137 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1148 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1148: ; preds = %if.then.i.i1141, %if.else.i.i1145 + %_M_string_length.i20.i.i1146 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1136, i64 0, i32 1 + %340 = load i64, i64* %_M_string_length.i20.i.i1146, align 8, !tbaa !59 + %_M_string_length.i.i2.i1147 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_9_b_path, i64 0, i32 1 + store i64 %340, i64* %_M_string_length.i.i2.i1147, align 8, !tbaa !59, !alias.scope !137 + %341 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1136 to %union.anon** + store %union.anon* %338, %union.anon** %341, align 8, !tbaa !62 + store i64 0, i64* %_M_string_length.i20.i.i1146, align 8, !tbaa !59 + store i8 0, i8* %arraydecay.i.i.i.i1138, align 1, !tbaa !42 + %_M_p.i.i.i.i1149 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp72, i64 0, i32 0, i32 0 + %342 = load i8*, i8** %_M_p.i.i.i.i1149, align 8, !tbaa !62 + %cmp.i.i.i1151 = icmp eq i8* %342, %331 + br i1 %cmp.i.i.i1151, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1153, label %if.then.i.i1152 + +if.then.i.i1152: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1148 + call void @_ZdlPv(i8* %342) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1153 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1153: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1148, %if.then.i.i1152 + call void @llvm.lifetime.end(i64 32, i8* nonnull %328) #2 + %_M_p.i.i1154 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_9_b_path, i64 0, i32 0, i32 0 + %343 = load i8*, i8** %_M_p.i.i1154, align 8, !tbaa !62 + %call75 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %343, i32 0, i64 1, i64 512, i64 1, i64 1) %344 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_10_w_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %344) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %344) #2 %345 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp76 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %345) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %345) #2 %346 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp76, i64 0, i32 2 %347 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp76 to %union.anon** - store %union.anon* %346, %union.anon** %347, align 8, !tbaa !52 + store %union.anon* %346, %union.anon** %347, align 8, !tbaa !58 %348 = bitcast %union.anon* %346 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %348, i8* nonnull getelementptr inbounds ([16 x i8], [16 x i8]* @.str.44, i64 0, i64 0), i64 15, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i1159 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp76, i64 0, i32 1 - store i64 15, i64* %_M_string_length.i.i.i.i.i.i1159, align 8, !tbaa !53 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %348, i8* nonnull getelementptr inbounds ([16 x i8], [16 x i8]* @.str.62, i64 0, i64 0), i64 15, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i1168 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp76, i64 0, i32 1 + store i64 15, i64* %_M_string_length.i.i.i.i.i.i1168, align 8, !tbaa !59 %349 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp76, i64 0, i32 2, i32 1, i64 7 - store i8 0, i8* %349, align 1, !tbaa !36 - %350 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !53, !noalias !122 - %351 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !56, !noalias !122 - %call3.i.i.i1164 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp76, i64 0, i64 0, i8* %351, i64 %350) #7, !noalias !122 + store i8 0, i8* %349, align 1, !tbaa !42 + %350 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !59, !noalias !140 + %351 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !62, !noalias !140 + %call3.i.i.i1173 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp76, i64 0, i64 0, i8* %351, i64 %350) #2, !noalias !140 %352 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_10_w_path, i64 0, i32 2 %353 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_10_w_path to %union.anon** - store %union.anon* %352, %union.anon** %353, align 8, !tbaa !52, !alias.scope !122 - %_M_p.i.i23.i.i1165 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1164, i64 0, i32 0, i32 0 - %354 = load i8*, i8** %_M_p.i.i23.i.i1165, align 8, !tbaa !56 - %355 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1164, i64 0, i32 2 - %arraydecay.i.i.i.i1166 = bitcast %union.anon* %355 to i8* - %cmp.i.i.i1167 = icmp eq i8* %354, %arraydecay.i.i.i.i1166 - br i1 %cmp.i.i.i1167, label %if.then.i.i1169, label %if.else.i.i1173 - -if.then.i.i1169: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1144 - %arraydecay.i.i.i1168 = bitcast %union.anon* %352 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1168, i8* %354, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1176 - -if.else.i.i1173: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1144 - %_M_p.i21.i.i1170 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_10_w_path, i64 0, i32 0, i32 0 - store i8* %354, i8** %_M_p.i21.i.i1170, align 8, !tbaa !56, !alias.scope !122 - %_M_allocated_capacity.i.i1171 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1164, i64 0, i32 2, i32 0 - %356 = load i64, i64* %_M_allocated_capacity.i.i1171, align 8, !tbaa !12 - %_M_allocated_capacity.i.i.i1172 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_10_w_path, i64 0, i32 2, i32 0 - store i64 %356, i64* %_M_allocated_capacity.i.i.i1172, align 8, !tbaa !12, !alias.scope !122 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1176 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1176: ; preds = %if.then.i.i1169, %if.else.i.i1173 - %_M_string_length.i20.i.i1174 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1164, i64 0, i32 1 - %357 = load i64, i64* %_M_string_length.i20.i.i1174, align 8, !tbaa !53 - %_M_string_length.i.i2.i1175 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_10_w_path, i64 0, i32 1 - store i64 %357, i64* %_M_string_length.i.i2.i1175, align 8, !tbaa !53, !alias.scope !122 - %358 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1164 to %union.anon** - store %union.anon* %355, %union.anon** %358, align 8, !tbaa !56 - store i64 0, i64* %_M_string_length.i20.i.i1174, align 8, !tbaa !53 - store i8 0, i8* %arraydecay.i.i.i.i1166, align 1, !tbaa !36 - %_M_p.i.i.i.i1177 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp76, i64 0, i32 0, i32 0 - %359 = load i8*, i8** %_M_p.i.i.i.i1177, align 8, !tbaa !56 - %cmp.i.i.i1179 = icmp eq i8* %359, %348 - br i1 %cmp.i.i.i1179, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1181, label %if.then.i.i1180 - -if.then.i.i1180: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1176 - call void @_ZdlPv(i8* %359) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1181 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1181: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1176, %if.then.i.i1180 - call void @llvm.lifetime.end(i64 32, i8* nonnull %345) #7 - %_M_p.i.i1182 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_10_w_path, i64 0, i32 0, i32 0 - %360 = load i8*, i8** %_M_p.i.i1182, align 8, !tbaa !56 - %call79 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %360, i32 0, i32 512, i32 512, i32 3, i32 3) + store %union.anon* %352, %union.anon** %353, align 8, !tbaa !58, !alias.scope !140 + %_M_p.i.i23.i.i1174 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1173, i64 0, i32 0, i32 0 + %354 = load i8*, i8** %_M_p.i.i23.i.i1174, align 8, !tbaa !62 + %355 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1173, i64 0, i32 2 + %arraydecay.i.i.i.i1175 = bitcast %union.anon* %355 to i8* + %cmp.i.i.i1176 = icmp eq i8* %354, %arraydecay.i.i.i.i1175 + br i1 %cmp.i.i.i1176, label %if.then.i.i1178, label %if.else.i.i1182 + +if.then.i.i1178: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1153 + %arraydecay.i.i.i1177 = bitcast %union.anon* %352 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1177, i8* %354, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1185 + +if.else.i.i1182: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1153 + %_M_p.i21.i.i1179 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_10_w_path, i64 0, i32 0, i32 0 + store i8* %354, i8** %_M_p.i21.i.i1179, align 8, !tbaa !62, !alias.scope !140 + %_M_allocated_capacity.i.i1180 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1173, i64 0, i32 2, i32 0 + %356 = load i64, i64* %_M_allocated_capacity.i.i1180, align 8, !tbaa !15 + %_M_allocated_capacity.i.i.i1181 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_10_w_path, i64 0, i32 2, i32 0 + store i64 %356, i64* %_M_allocated_capacity.i.i.i1181, align 8, !tbaa !15, !alias.scope !140 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1185 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1185: ; preds = %if.then.i.i1178, %if.else.i.i1182 + %_M_string_length.i20.i.i1183 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1173, i64 0, i32 1 + %357 = load i64, i64* %_M_string_length.i20.i.i1183, align 8, !tbaa !59 + %_M_string_length.i.i2.i1184 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_10_w_path, i64 0, i32 1 + store i64 %357, i64* %_M_string_length.i.i2.i1184, align 8, !tbaa !59, !alias.scope !140 + %358 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1173 to %union.anon** + store %union.anon* %355, %union.anon** %358, align 8, !tbaa !62 + store i64 0, i64* %_M_string_length.i20.i.i1183, align 8, !tbaa !59 + store i8 0, i8* %arraydecay.i.i.i.i1175, align 1, !tbaa !42 + %_M_p.i.i.i.i1186 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp76, i64 0, i32 0, i32 0 + %359 = load i8*, i8** %_M_p.i.i.i.i1186, align 8, !tbaa !62 + %cmp.i.i.i1188 = icmp eq i8* %359, %348 + br i1 %cmp.i.i.i1188, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1190, label %if.then.i.i1189 + +if.then.i.i1189: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1185 + call void @_ZdlPv(i8* %359) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1190 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1190: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1185, %if.then.i.i1189 + call void @llvm.lifetime.end(i64 32, i8* nonnull %345) #2 + %_M_p.i.i1191 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_10_w_path, i64 0, i32 0, i32 0 + %360 = load i8*, i8** %_M_p.i.i1191, align 8, !tbaa !62 + %call79 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %360, i32 0, i64 512, i64 512, i64 3, i64 3) %361 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_10_b_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %361) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %361) #2 %362 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp80 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %362) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %362) #2 %363 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp80, i64 0, i32 2 %364 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp80 to %union.anon** - store %union.anon* %363, %union.anon** %364, align 8, !tbaa !52 + store %union.anon* %363, %union.anon** %364, align 8, !tbaa !58 %365 = bitcast %union.anon* %363 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %365, i8* nonnull getelementptr inbounds ([16 x i8], [16 x i8]* @.str.45, i64 0, i64 0), i64 15, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i1198 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp80, i64 0, i32 1 - store i64 15, i64* %_M_string_length.i.i.i.i.i.i1198, align 8, !tbaa !53 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %365, i8* nonnull getelementptr inbounds ([16 x i8], [16 x i8]* @.str.63, i64 0, i64 0), i64 15, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i1206 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp80, i64 0, i32 1 + store i64 15, i64* %_M_string_length.i.i.i.i.i.i1206, align 8, !tbaa !59 %366 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp80, i64 0, i32 2, i32 1, i64 7 - store i8 0, i8* %366, align 1, !tbaa !36 - %367 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !53, !noalias !125 - %368 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !56, !noalias !125 - %call3.i.i.i1203 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp80, i64 0, i64 0, i8* %368, i64 %367) #7, !noalias !125 + store i8 0, i8* %366, align 1, !tbaa !42 + %367 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !59, !noalias !143 + %368 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !62, !noalias !143 + %call3.i.i.i1211 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp80, i64 0, i64 0, i8* %368, i64 %367) #2, !noalias !143 %369 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_10_b_path, i64 0, i32 2 %370 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_10_b_path to %union.anon** - store %union.anon* %369, %union.anon** %370, align 8, !tbaa !52, !alias.scope !125 - %_M_p.i.i23.i.i1204 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1203, i64 0, i32 0, i32 0 - %371 = load i8*, i8** %_M_p.i.i23.i.i1204, align 8, !tbaa !56 - %372 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1203, i64 0, i32 2 - %arraydecay.i.i.i.i1205 = bitcast %union.anon* %372 to i8* - %cmp.i.i.i1206 = icmp eq i8* %371, %arraydecay.i.i.i.i1205 - br i1 %cmp.i.i.i1206, label %if.then.i.i1208, label %if.else.i.i1212 - -if.then.i.i1208: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1181 - %arraydecay.i.i.i1207 = bitcast %union.anon* %369 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1207, i8* %371, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1215 - -if.else.i.i1212: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1181 - %_M_p.i21.i.i1209 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_10_b_path, i64 0, i32 0, i32 0 - store i8* %371, i8** %_M_p.i21.i.i1209, align 8, !tbaa !56, !alias.scope !125 - %_M_allocated_capacity.i.i1210 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1203, i64 0, i32 2, i32 0 - %373 = load i64, i64* %_M_allocated_capacity.i.i1210, align 8, !tbaa !12 - %_M_allocated_capacity.i.i.i1211 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_10_b_path, i64 0, i32 2, i32 0 - store i64 %373, i64* %_M_allocated_capacity.i.i.i1211, align 8, !tbaa !12, !alias.scope !125 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1215 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1215: ; preds = %if.then.i.i1208, %if.else.i.i1212 - %_M_string_length.i20.i.i1213 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1203, i64 0, i32 1 - %374 = load i64, i64* %_M_string_length.i20.i.i1213, align 8, !tbaa !53 - %_M_string_length.i.i2.i1214 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_10_b_path, i64 0, i32 1 - store i64 %374, i64* %_M_string_length.i.i2.i1214, align 8, !tbaa !53, !alias.scope !125 - %375 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1203 to %union.anon** - store %union.anon* %372, %union.anon** %375, align 8, !tbaa !56 - store i64 0, i64* %_M_string_length.i20.i.i1213, align 8, !tbaa !53 - store i8 0, i8* %arraydecay.i.i.i.i1205, align 1, !tbaa !36 - %_M_p.i.i.i.i1216 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp80, i64 0, i32 0, i32 0 - %376 = load i8*, i8** %_M_p.i.i.i.i1216, align 8, !tbaa !56 - %cmp.i.i.i1218 = icmp eq i8* %376, %365 - br i1 %cmp.i.i.i1218, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1220, label %if.then.i.i1219 - -if.then.i.i1219: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1215 - call void @_ZdlPv(i8* %376) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1220 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1220: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1215, %if.then.i.i1219 - call void @llvm.lifetime.end(i64 32, i8* nonnull %362) #7 - %_M_p.i.i1221 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_10_b_path, i64 0, i32 0, i32 0 - %377 = load i8*, i8** %_M_p.i.i1221, align 8, !tbaa !56 - %call83 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %377, i32 0, i32 1, i32 512, i32 1, i32 1) + store %union.anon* %369, %union.anon** %370, align 8, !tbaa !58, !alias.scope !143 + %_M_p.i.i23.i.i1212 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1211, i64 0, i32 0, i32 0 + %371 = load i8*, i8** %_M_p.i.i23.i.i1212, align 8, !tbaa !62 + %372 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1211, i64 0, i32 2 + %arraydecay.i.i.i.i1213 = bitcast %union.anon* %372 to i8* + %cmp.i.i.i1214 = icmp eq i8* %371, %arraydecay.i.i.i.i1213 + br i1 %cmp.i.i.i1214, label %if.then.i.i1216, label %if.else.i.i1220 + +if.then.i.i1216: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1190 + %arraydecay.i.i.i1215 = bitcast %union.anon* %369 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1215, i8* %371, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1223 + +if.else.i.i1220: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1190 + %_M_p.i21.i.i1217 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_10_b_path, i64 0, i32 0, i32 0 + store i8* %371, i8** %_M_p.i21.i.i1217, align 8, !tbaa !62, !alias.scope !143 + %_M_allocated_capacity.i.i1218 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1211, i64 0, i32 2, i32 0 + %373 = load i64, i64* %_M_allocated_capacity.i.i1218, align 8, !tbaa !15 + %_M_allocated_capacity.i.i.i1219 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_10_b_path, i64 0, i32 2, i32 0 + store i64 %373, i64* %_M_allocated_capacity.i.i.i1219, align 8, !tbaa !15, !alias.scope !143 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1223 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1223: ; preds = %if.then.i.i1216, %if.else.i.i1220 + %_M_string_length.i20.i.i1221 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1211, i64 0, i32 1 + %374 = load i64, i64* %_M_string_length.i20.i.i1221, align 8, !tbaa !59 + %_M_string_length.i.i2.i1222 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_10_b_path, i64 0, i32 1 + store i64 %374, i64* %_M_string_length.i.i2.i1222, align 8, !tbaa !59, !alias.scope !143 + %375 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1211 to %union.anon** + store %union.anon* %372, %union.anon** %375, align 8, !tbaa !62 + store i64 0, i64* %_M_string_length.i20.i.i1221, align 8, !tbaa !59 + store i8 0, i8* %arraydecay.i.i.i.i1213, align 1, !tbaa !42 + %_M_p.i.i.i.i1224 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp80, i64 0, i32 0, i32 0 + %376 = load i8*, i8** %_M_p.i.i.i.i1224, align 8, !tbaa !62 + %cmp.i.i.i1226 = icmp eq i8* %376, %365 + br i1 %cmp.i.i.i1226, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1228, label %if.then.i.i1227 + +if.then.i.i1227: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1223 + call void @_ZdlPv(i8* %376) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1228 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1228: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1223, %if.then.i.i1227 + call void @llvm.lifetime.end(i64 32, i8* nonnull %362) #2 + %_M_p.i.i1229 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_10_b_path, i64 0, i32 0, i32 0 + %377 = load i8*, i8** %_M_p.i.i1229, align 8, !tbaa !62 + %call83 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %377, i32 0, i64 1, i64 512, i64 1, i64 1) %378 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_11_w_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %378) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %378) #2 %379 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp84 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %379) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %379) #2 %380 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp84, i64 0, i32 2 %381 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp84 to %union.anon** - store %union.anon* %380, %union.anon** %381, align 8, !tbaa !52 + store %union.anon* %380, %union.anon** %381, align 8, !tbaa !58 %382 = bitcast %union.anon* %380 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %382, i8* nonnull getelementptr inbounds ([16 x i8], [16 x i8]* @.str.46, i64 0, i64 0), i64 15, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i1256 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp84, i64 0, i32 1 - store i64 15, i64* %_M_string_length.i.i.i.i.i.i1256, align 8, !tbaa !53 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %382, i8* nonnull getelementptr inbounds ([16 x i8], [16 x i8]* @.str.64, i64 0, i64 0), i64 15, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i1250 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp84, i64 0, i32 1 + store i64 15, i64* %_M_string_length.i.i.i.i.i.i1250, align 8, !tbaa !59 %383 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp84, i64 0, i32 2, i32 1, i64 7 - store i8 0, i8* %383, align 1, !tbaa !36 - %384 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !53, !noalias !128 - %385 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !56, !noalias !128 - %call3.i.i.i1261 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp84, i64 0, i64 0, i8* %385, i64 %384) #7, !noalias !128 + store i8 0, i8* %383, align 1, !tbaa !42 + %384 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !59, !noalias !146 + %385 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !62, !noalias !146 + %call3.i.i.i1255 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp84, i64 0, i64 0, i8* %385, i64 %384) #2, !noalias !146 %386 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_11_w_path, i64 0, i32 2 %387 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_11_w_path to %union.anon** - store %union.anon* %386, %union.anon** %387, align 8, !tbaa !52, !alias.scope !128 - %_M_p.i.i23.i.i1262 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1261, i64 0, i32 0, i32 0 - %388 = load i8*, i8** %_M_p.i.i23.i.i1262, align 8, !tbaa !56 - %389 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1261, i64 0, i32 2 - %arraydecay.i.i.i.i1263 = bitcast %union.anon* %389 to i8* - %cmp.i.i.i1264 = icmp eq i8* %388, %arraydecay.i.i.i.i1263 - br i1 %cmp.i.i.i1264, label %if.then.i.i1266, label %if.else.i.i1270 - -if.then.i.i1266: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1220 - %arraydecay.i.i.i1265 = bitcast %union.anon* %386 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1265, i8* %388, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1273 - -if.else.i.i1270: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1220 - %_M_p.i21.i.i1267 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_11_w_path, i64 0, i32 0, i32 0 - store i8* %388, i8** %_M_p.i21.i.i1267, align 8, !tbaa !56, !alias.scope !128 - %_M_allocated_capacity.i.i1268 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1261, i64 0, i32 2, i32 0 - %390 = load i64, i64* %_M_allocated_capacity.i.i1268, align 8, !tbaa !12 - %_M_allocated_capacity.i.i.i1269 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_11_w_path, i64 0, i32 2, i32 0 - store i64 %390, i64* %_M_allocated_capacity.i.i.i1269, align 8, !tbaa !12, !alias.scope !128 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1273 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1273: ; preds = %if.then.i.i1266, %if.else.i.i1270 - %_M_string_length.i20.i.i1271 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1261, i64 0, i32 1 - %391 = load i64, i64* %_M_string_length.i20.i.i1271, align 8, !tbaa !53 - %_M_string_length.i.i2.i1272 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_11_w_path, i64 0, i32 1 - store i64 %391, i64* %_M_string_length.i.i2.i1272, align 8, !tbaa !53, !alias.scope !128 - %392 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1261 to %union.anon** - store %union.anon* %389, %union.anon** %392, align 8, !tbaa !56 - store i64 0, i64* %_M_string_length.i20.i.i1271, align 8, !tbaa !53 - store i8 0, i8* %arraydecay.i.i.i.i1263, align 1, !tbaa !36 - %_M_p.i.i.i.i1274 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp84, i64 0, i32 0, i32 0 - %393 = load i8*, i8** %_M_p.i.i.i.i1274, align 8, !tbaa !56 - %cmp.i.i.i1276 = icmp eq i8* %393, %382 - br i1 %cmp.i.i.i1276, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1278, label %if.then.i.i1277 - -if.then.i.i1277: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1273 - call void @_ZdlPv(i8* %393) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1278 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1278: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1273, %if.then.i.i1277 - call void @llvm.lifetime.end(i64 32, i8* nonnull %379) #7 - %_M_p.i.i1279 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_11_w_path, i64 0, i32 0, i32 0 - %394 = load i8*, i8** %_M_p.i.i1279, align 8, !tbaa !56 - %call87 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %394, i32 0, i32 512, i32 512, i32 3, i32 3) + store %union.anon* %386, %union.anon** %387, align 8, !tbaa !58, !alias.scope !146 + %_M_p.i.i23.i.i1256 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1255, i64 0, i32 0, i32 0 + %388 = load i8*, i8** %_M_p.i.i23.i.i1256, align 8, !tbaa !62 + %389 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1255, i64 0, i32 2 + %arraydecay.i.i.i.i1257 = bitcast %union.anon* %389 to i8* + %cmp.i.i.i1258 = icmp eq i8* %388, %arraydecay.i.i.i.i1257 + br i1 %cmp.i.i.i1258, label %if.then.i.i1260, label %if.else.i.i1264 + +if.then.i.i1260: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1228 + %arraydecay.i.i.i1259 = bitcast %union.anon* %386 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1259, i8* %388, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1267 + +if.else.i.i1264: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1228 + %_M_p.i21.i.i1261 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_11_w_path, i64 0, i32 0, i32 0 + store i8* %388, i8** %_M_p.i21.i.i1261, align 8, !tbaa !62, !alias.scope !146 + %_M_allocated_capacity.i.i1262 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1255, i64 0, i32 2, i32 0 + %390 = load i64, i64* %_M_allocated_capacity.i.i1262, align 8, !tbaa !15 + %_M_allocated_capacity.i.i.i1263 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_11_w_path, i64 0, i32 2, i32 0 + store i64 %390, i64* %_M_allocated_capacity.i.i.i1263, align 8, !tbaa !15, !alias.scope !146 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1267 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1267: ; preds = %if.then.i.i1260, %if.else.i.i1264 + %_M_string_length.i20.i.i1265 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1255, i64 0, i32 1 + %391 = load i64, i64* %_M_string_length.i20.i.i1265, align 8, !tbaa !59 + %_M_string_length.i.i2.i1266 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_11_w_path, i64 0, i32 1 + store i64 %391, i64* %_M_string_length.i.i2.i1266, align 8, !tbaa !59, !alias.scope !146 + %392 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1255 to %union.anon** + store %union.anon* %389, %union.anon** %392, align 8, !tbaa !62 + store i64 0, i64* %_M_string_length.i20.i.i1265, align 8, !tbaa !59 + store i8 0, i8* %arraydecay.i.i.i.i1257, align 1, !tbaa !42 + %_M_p.i.i.i.i1268 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp84, i64 0, i32 0, i32 0 + %393 = load i8*, i8** %_M_p.i.i.i.i1268, align 8, !tbaa !62 + %cmp.i.i.i1270 = icmp eq i8* %393, %382 + br i1 %cmp.i.i.i1270, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1272, label %if.then.i.i1271 + +if.then.i.i1271: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1267 + call void @_ZdlPv(i8* %393) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1272 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1272: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1267, %if.then.i.i1271 + call void @llvm.lifetime.end(i64 32, i8* nonnull %379) #2 + %_M_p.i.i1273 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_11_w_path, i64 0, i32 0, i32 0 + %394 = load i8*, i8** %_M_p.i.i1273, align 8, !tbaa !62 + %call87 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %394, i32 0, i64 512, i64 512, i64 3, i64 3) %395 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_11_b_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %395) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %395) #2 %396 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp88 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %396) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %396) #2 %397 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp88, i64 0, i32 2 %398 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp88 to %union.anon** - store %union.anon* %397, %union.anon** %398, align 8, !tbaa !52 + store %union.anon* %397, %union.anon** %398, align 8, !tbaa !58 %399 = bitcast %union.anon* %397 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %399, i8* nonnull getelementptr inbounds ([16 x i8], [16 x i8]* @.str.47, i64 0, i64 0), i64 15, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i1309 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp88, i64 0, i32 1 - store i64 15, i64* %_M_string_length.i.i.i.i.i.i1309, align 8, !tbaa !53 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %399, i8* nonnull getelementptr inbounds ([16 x i8], [16 x i8]* @.str.65, i64 0, i64 0), i64 15, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i1318 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp88, i64 0, i32 1 + store i64 15, i64* %_M_string_length.i.i.i.i.i.i1318, align 8, !tbaa !59 %400 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp88, i64 0, i32 2, i32 1, i64 7 - store i8 0, i8* %400, align 1, !tbaa !36 - %401 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !53, !noalias !131 - %402 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !56, !noalias !131 - %call3.i.i.i1314 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp88, i64 0, i64 0, i8* %402, i64 %401) #7, !noalias !131 + store i8 0, i8* %400, align 1, !tbaa !42 + %401 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !59, !noalias !149 + %402 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !62, !noalias !149 + %call3.i.i.i1323 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp88, i64 0, i64 0, i8* %402, i64 %401) #2, !noalias !149 %403 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_11_b_path, i64 0, i32 2 %404 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_11_b_path to %union.anon** - store %union.anon* %403, %union.anon** %404, align 8, !tbaa !52, !alias.scope !131 - %_M_p.i.i23.i.i1315 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1314, i64 0, i32 0, i32 0 - %405 = load i8*, i8** %_M_p.i.i23.i.i1315, align 8, !tbaa !56 - %406 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1314, i64 0, i32 2 - %arraydecay.i.i.i.i1316 = bitcast %union.anon* %406 to i8* - %cmp.i.i.i1317 = icmp eq i8* %405, %arraydecay.i.i.i.i1316 - br i1 %cmp.i.i.i1317, label %if.then.i.i1319, label %if.else.i.i1323 - -if.then.i.i1319: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1278 - %arraydecay.i.i.i1318 = bitcast %union.anon* %403 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1318, i8* %405, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1326 - -if.else.i.i1323: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1278 - %_M_p.i21.i.i1320 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_11_b_path, i64 0, i32 0, i32 0 - store i8* %405, i8** %_M_p.i21.i.i1320, align 8, !tbaa !56, !alias.scope !131 - %_M_allocated_capacity.i.i1321 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1314, i64 0, i32 2, i32 0 - %407 = load i64, i64* %_M_allocated_capacity.i.i1321, align 8, !tbaa !12 - %_M_allocated_capacity.i.i.i1322 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_11_b_path, i64 0, i32 2, i32 0 - store i64 %407, i64* %_M_allocated_capacity.i.i.i1322, align 8, !tbaa !12, !alias.scope !131 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1326 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1326: ; preds = %if.then.i.i1319, %if.else.i.i1323 - %_M_string_length.i20.i.i1324 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1314, i64 0, i32 1 - %408 = load i64, i64* %_M_string_length.i20.i.i1324, align 8, !tbaa !53 - %_M_string_length.i.i2.i1325 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_11_b_path, i64 0, i32 1 - store i64 %408, i64* %_M_string_length.i.i2.i1325, align 8, !tbaa !53, !alias.scope !131 - %409 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1314 to %union.anon** - store %union.anon* %406, %union.anon** %409, align 8, !tbaa !56 - store i64 0, i64* %_M_string_length.i20.i.i1324, align 8, !tbaa !53 - store i8 0, i8* %arraydecay.i.i.i.i1316, align 1, !tbaa !36 - %_M_p.i.i.i.i1327 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp88, i64 0, i32 0, i32 0 - %410 = load i8*, i8** %_M_p.i.i.i.i1327, align 8, !tbaa !56 - %cmp.i.i.i1329 = icmp eq i8* %410, %399 - br i1 %cmp.i.i.i1329, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1331, label %if.then.i.i1330 - -if.then.i.i1330: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1326 - call void @_ZdlPv(i8* %410) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1331 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1331: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1326, %if.then.i.i1330 - call void @llvm.lifetime.end(i64 32, i8* nonnull %396) #7 - %_M_p.i.i1332 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_11_b_path, i64 0, i32 0, i32 0 - %411 = load i8*, i8** %_M_p.i.i1332, align 8, !tbaa !56 - %call91 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %411, i32 0, i32 1, i32 512, i32 1, i32 1) + store %union.anon* %403, %union.anon** %404, align 8, !tbaa !58, !alias.scope !149 + %_M_p.i.i23.i.i1324 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1323, i64 0, i32 0, i32 0 + %405 = load i8*, i8** %_M_p.i.i23.i.i1324, align 8, !tbaa !62 + %406 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1323, i64 0, i32 2 + %arraydecay.i.i.i.i1325 = bitcast %union.anon* %406 to i8* + %cmp.i.i.i1326 = icmp eq i8* %405, %arraydecay.i.i.i.i1325 + br i1 %cmp.i.i.i1326, label %if.then.i.i1328, label %if.else.i.i1332 + +if.then.i.i1328: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1272 + %arraydecay.i.i.i1327 = bitcast %union.anon* %403 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1327, i8* %405, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1335 + +if.else.i.i1332: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1272 + %_M_p.i21.i.i1329 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_11_b_path, i64 0, i32 0, i32 0 + store i8* %405, i8** %_M_p.i21.i.i1329, align 8, !tbaa !62, !alias.scope !149 + %_M_allocated_capacity.i.i1330 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1323, i64 0, i32 2, i32 0 + %407 = load i64, i64* %_M_allocated_capacity.i.i1330, align 8, !tbaa !15 + %_M_allocated_capacity.i.i.i1331 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_11_b_path, i64 0, i32 2, i32 0 + store i64 %407, i64* %_M_allocated_capacity.i.i.i1331, align 8, !tbaa !15, !alias.scope !149 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1335 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1335: ; preds = %if.then.i.i1328, %if.else.i.i1332 + %_M_string_length.i20.i.i1333 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1323, i64 0, i32 1 + %408 = load i64, i64* %_M_string_length.i20.i.i1333, align 8, !tbaa !59 + %_M_string_length.i.i2.i1334 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_11_b_path, i64 0, i32 1 + store i64 %408, i64* %_M_string_length.i.i2.i1334, align 8, !tbaa !59, !alias.scope !149 + %409 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1323 to %union.anon** + store %union.anon* %406, %union.anon** %409, align 8, !tbaa !62 + store i64 0, i64* %_M_string_length.i20.i.i1333, align 8, !tbaa !59 + store i8 0, i8* %arraydecay.i.i.i.i1325, align 1, !tbaa !42 + %_M_p.i.i.i.i1336 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp88, i64 0, i32 0, i32 0 + %410 = load i8*, i8** %_M_p.i.i.i.i1336, align 8, !tbaa !62 + %cmp.i.i.i1338 = icmp eq i8* %410, %399 + br i1 %cmp.i.i.i1338, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1340, label %if.then.i.i1339 + +if.then.i.i1339: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1335 + call void @_ZdlPv(i8* %410) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1340 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1340: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1335, %if.then.i.i1339 + call void @llvm.lifetime.end(i64 32, i8* nonnull %396) #2 + %_M_p.i.i1341 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_11_b_path, i64 0, i32 0, i32 0 + %411 = load i8*, i8** %_M_p.i.i1341, align 8, !tbaa !62 + %call91 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %411, i32 0, i64 1, i64 512, i64 1, i64 1) %412 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_12_w_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %412) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %412) #2 %413 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp92 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %413) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %413) #2 %414 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp92, i64 0, i32 2 %415 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp92 to %union.anon** - store %union.anon* %414, %union.anon** %415, align 8, !tbaa !52 + store %union.anon* %414, %union.anon** %415, align 8, !tbaa !58 %416 = bitcast %union.anon* %414 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %416, i8* nonnull getelementptr inbounds ([16 x i8], [16 x i8]* @.str.48, i64 0, i64 0), i64 15, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i1352 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp92, i64 0, i32 1 - store i64 15, i64* %_M_string_length.i.i.i.i.i.i1352, align 8, !tbaa !53 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %416, i8* nonnull getelementptr inbounds ([16 x i8], [16 x i8]* @.str.66, i64 0, i64 0), i64 15, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i1355 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp92, i64 0, i32 1 + store i64 15, i64* %_M_string_length.i.i.i.i.i.i1355, align 8, !tbaa !59 %417 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp92, i64 0, i32 2, i32 1, i64 7 - store i8 0, i8* %417, align 1, !tbaa !36 - %418 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !53, !noalias !134 - %419 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !56, !noalias !134 - %call3.i.i.i1357 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp92, i64 0, i64 0, i8* %419, i64 %418) #7, !noalias !134 + store i8 0, i8* %417, align 1, !tbaa !42 + %418 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !59, !noalias !152 + %419 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !62, !noalias !152 + %call3.i.i.i1360 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp92, i64 0, i64 0, i8* %419, i64 %418) #2, !noalias !152 %420 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_12_w_path, i64 0, i32 2 %421 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_12_w_path to %union.anon** - store %union.anon* %420, %union.anon** %421, align 8, !tbaa !52, !alias.scope !134 - %_M_p.i.i23.i.i1358 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1357, i64 0, i32 0, i32 0 - %422 = load i8*, i8** %_M_p.i.i23.i.i1358, align 8, !tbaa !56 - %423 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1357, i64 0, i32 2 - %arraydecay.i.i.i.i1359 = bitcast %union.anon* %423 to i8* - %cmp.i.i.i1360 = icmp eq i8* %422, %arraydecay.i.i.i.i1359 - br i1 %cmp.i.i.i1360, label %if.then.i.i1362, label %if.else.i.i1366 - -if.then.i.i1362: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1331 - %arraydecay.i.i.i1361 = bitcast %union.anon* %420 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1361, i8* %422, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1369 - -if.else.i.i1366: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1331 - %_M_p.i21.i.i1363 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_12_w_path, i64 0, i32 0, i32 0 - store i8* %422, i8** %_M_p.i21.i.i1363, align 8, !tbaa !56, !alias.scope !134 - %_M_allocated_capacity.i.i1364 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1357, i64 0, i32 2, i32 0 - %424 = load i64, i64* %_M_allocated_capacity.i.i1364, align 8, !tbaa !12 - %_M_allocated_capacity.i.i.i1365 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_12_w_path, i64 0, i32 2, i32 0 - store i64 %424, i64* %_M_allocated_capacity.i.i.i1365, align 8, !tbaa !12, !alias.scope !134 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1369 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1369: ; preds = %if.then.i.i1362, %if.else.i.i1366 - %_M_string_length.i20.i.i1367 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1357, i64 0, i32 1 - %425 = load i64, i64* %_M_string_length.i20.i.i1367, align 8, !tbaa !53 - %_M_string_length.i.i2.i1368 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_12_w_path, i64 0, i32 1 - store i64 %425, i64* %_M_string_length.i.i2.i1368, align 8, !tbaa !53, !alias.scope !134 - %426 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1357 to %union.anon** - store %union.anon* %423, %union.anon** %426, align 8, !tbaa !56 - store i64 0, i64* %_M_string_length.i20.i.i1367, align 8, !tbaa !53 - store i8 0, i8* %arraydecay.i.i.i.i1359, align 1, !tbaa !36 - %_M_p.i.i.i.i1370 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp92, i64 0, i32 0, i32 0 - %427 = load i8*, i8** %_M_p.i.i.i.i1370, align 8, !tbaa !56 - %cmp.i.i.i1372 = icmp eq i8* %427, %416 - br i1 %cmp.i.i.i1372, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1374, label %if.then.i.i1373 - -if.then.i.i1373: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1369 - call void @_ZdlPv(i8* %427) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1374 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1374: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1369, %if.then.i.i1373 - call void @llvm.lifetime.end(i64 32, i8* nonnull %413) #7 - %_M_p.i.i1375 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_12_w_path, i64 0, i32 0, i32 0 - %428 = load i8*, i8** %_M_p.i.i1375, align 8, !tbaa !56 - %call95 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %428, i32 0, i32 512, i32 512, i32 3, i32 3) + store %union.anon* %420, %union.anon** %421, align 8, !tbaa !58, !alias.scope !152 + %_M_p.i.i23.i.i1361 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1360, i64 0, i32 0, i32 0 + %422 = load i8*, i8** %_M_p.i.i23.i.i1361, align 8, !tbaa !62 + %423 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1360, i64 0, i32 2 + %arraydecay.i.i.i.i1362 = bitcast %union.anon* %423 to i8* + %cmp.i.i.i1363 = icmp eq i8* %422, %arraydecay.i.i.i.i1362 + br i1 %cmp.i.i.i1363, label %if.then.i.i1365, label %if.else.i.i1369 + +if.then.i.i1365: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1340 + %arraydecay.i.i.i1364 = bitcast %union.anon* %420 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1364, i8* %422, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1372 + +if.else.i.i1369: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1340 + %_M_p.i21.i.i1366 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_12_w_path, i64 0, i32 0, i32 0 + store i8* %422, i8** %_M_p.i21.i.i1366, align 8, !tbaa !62, !alias.scope !152 + %_M_allocated_capacity.i.i1367 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1360, i64 0, i32 2, i32 0 + %424 = load i64, i64* %_M_allocated_capacity.i.i1367, align 8, !tbaa !15 + %_M_allocated_capacity.i.i.i1368 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_12_w_path, i64 0, i32 2, i32 0 + store i64 %424, i64* %_M_allocated_capacity.i.i.i1368, align 8, !tbaa !15, !alias.scope !152 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1372 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1372: ; preds = %if.then.i.i1365, %if.else.i.i1369 + %_M_string_length.i20.i.i1370 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1360, i64 0, i32 1 + %425 = load i64, i64* %_M_string_length.i20.i.i1370, align 8, !tbaa !59 + %_M_string_length.i.i2.i1371 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_12_w_path, i64 0, i32 1 + store i64 %425, i64* %_M_string_length.i.i2.i1371, align 8, !tbaa !59, !alias.scope !152 + %426 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1360 to %union.anon** + store %union.anon* %423, %union.anon** %426, align 8, !tbaa !62 + store i64 0, i64* %_M_string_length.i20.i.i1370, align 8, !tbaa !59 + store i8 0, i8* %arraydecay.i.i.i.i1362, align 1, !tbaa !42 + %_M_p.i.i.i.i1373 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp92, i64 0, i32 0, i32 0 + %427 = load i8*, i8** %_M_p.i.i.i.i1373, align 8, !tbaa !62 + %cmp.i.i.i1375 = icmp eq i8* %427, %416 + br i1 %cmp.i.i.i1375, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1377, label %if.then.i.i1376 + +if.then.i.i1376: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1372 + call void @_ZdlPv(i8* %427) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1377 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1377: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1372, %if.then.i.i1376 + call void @llvm.lifetime.end(i64 32, i8* nonnull %413) #2 + %_M_p.i.i1378 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_12_w_path, i64 0, i32 0, i32 0 + %428 = load i8*, i8** %_M_p.i.i1378, align 8, !tbaa !62 + %call95 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %428, i32 0, i64 512, i64 512, i64 3, i64 3) %429 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_12_b_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %429) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %429) #2 %430 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp96 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %430) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %430) #2 %431 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp96, i64 0, i32 2 %432 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp96 to %union.anon** - store %union.anon* %431, %union.anon** %432, align 8, !tbaa !52 + store %union.anon* %431, %union.anon** %432, align 8, !tbaa !58 %433 = bitcast %union.anon* %431 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %433, i8* nonnull getelementptr inbounds ([16 x i8], [16 x i8]* @.str.49, i64 0, i64 0), i64 15, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i1420 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp96, i64 0, i32 1 - store i64 15, i64* %_M_string_length.i.i.i.i.i.i1420, align 8, !tbaa !53 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %433, i8* nonnull getelementptr inbounds ([16 x i8], [16 x i8]* @.str.67, i64 0, i64 0), i64 15, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i1413 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp96, i64 0, i32 1 + store i64 15, i64* %_M_string_length.i.i.i.i.i.i1413, align 8, !tbaa !59 %434 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp96, i64 0, i32 2, i32 1, i64 7 - store i8 0, i8* %434, align 1, !tbaa !36 - %435 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !53, !noalias !137 - %436 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !56, !noalias !137 - %call3.i.i.i1425 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp96, i64 0, i64 0, i8* %436, i64 %435) #7, !noalias !137 + store i8 0, i8* %434, align 1, !tbaa !42 + %435 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !59, !noalias !155 + %436 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !62, !noalias !155 + %call3.i.i.i1418 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp96, i64 0, i64 0, i8* %436, i64 %435) #2, !noalias !155 %437 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_12_b_path, i64 0, i32 2 %438 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_12_b_path to %union.anon** - store %union.anon* %437, %union.anon** %438, align 8, !tbaa !52, !alias.scope !137 - %_M_p.i.i23.i.i1426 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1425, i64 0, i32 0, i32 0 - %439 = load i8*, i8** %_M_p.i.i23.i.i1426, align 8, !tbaa !56 - %440 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1425, i64 0, i32 2 - %arraydecay.i.i.i.i1427 = bitcast %union.anon* %440 to i8* - %cmp.i.i.i1428 = icmp eq i8* %439, %arraydecay.i.i.i.i1427 - br i1 %cmp.i.i.i1428, label %if.then.i.i1430, label %if.else.i.i1434 - -if.then.i.i1430: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1374 - %arraydecay.i.i.i1429 = bitcast %union.anon* %437 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1429, i8* %439, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1437 - -if.else.i.i1434: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1374 - %_M_p.i21.i.i1431 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_12_b_path, i64 0, i32 0, i32 0 - store i8* %439, i8** %_M_p.i21.i.i1431, align 8, !tbaa !56, !alias.scope !137 - %_M_allocated_capacity.i.i1432 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1425, i64 0, i32 2, i32 0 - %441 = load i64, i64* %_M_allocated_capacity.i.i1432, align 8, !tbaa !12 - %_M_allocated_capacity.i.i.i1433 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_12_b_path, i64 0, i32 2, i32 0 - store i64 %441, i64* %_M_allocated_capacity.i.i.i1433, align 8, !tbaa !12, !alias.scope !137 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1437 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1437: ; preds = %if.then.i.i1430, %if.else.i.i1434 - %_M_string_length.i20.i.i1435 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1425, i64 0, i32 1 - %442 = load i64, i64* %_M_string_length.i20.i.i1435, align 8, !tbaa !53 - %_M_string_length.i.i2.i1436 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_12_b_path, i64 0, i32 1 - store i64 %442, i64* %_M_string_length.i.i2.i1436, align 8, !tbaa !53, !alias.scope !137 - %443 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1425 to %union.anon** - store %union.anon* %440, %union.anon** %443, align 8, !tbaa !56 - store i64 0, i64* %_M_string_length.i20.i.i1435, align 8, !tbaa !53 - store i8 0, i8* %arraydecay.i.i.i.i1427, align 1, !tbaa !36 - %_M_p.i.i.i.i1438 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp96, i64 0, i32 0, i32 0 - %444 = load i8*, i8** %_M_p.i.i.i.i1438, align 8, !tbaa !56 - %cmp.i.i.i1440 = icmp eq i8* %444, %433 - br i1 %cmp.i.i.i1440, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1442, label %if.then.i.i1441 - -if.then.i.i1441: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1437 - call void @_ZdlPv(i8* %444) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1442 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1442: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1437, %if.then.i.i1441 - call void @llvm.lifetime.end(i64 32, i8* nonnull %430) #7 - %_M_p.i.i1443 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_12_b_path, i64 0, i32 0, i32 0 - %445 = load i8*, i8** %_M_p.i.i1443, align 8, !tbaa !56 - %call99 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %445, i32 0, i32 1, i32 512, i32 1, i32 1) + store %union.anon* %437, %union.anon** %438, align 8, !tbaa !58, !alias.scope !155 + %_M_p.i.i23.i.i1419 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1418, i64 0, i32 0, i32 0 + %439 = load i8*, i8** %_M_p.i.i23.i.i1419, align 8, !tbaa !62 + %440 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1418, i64 0, i32 2 + %arraydecay.i.i.i.i1420 = bitcast %union.anon* %440 to i8* + %cmp.i.i.i1421 = icmp eq i8* %439, %arraydecay.i.i.i.i1420 + br i1 %cmp.i.i.i1421, label %if.then.i.i1423, label %if.else.i.i1427 + +if.then.i.i1423: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1377 + %arraydecay.i.i.i1422 = bitcast %union.anon* %437 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1422, i8* %439, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1430 + +if.else.i.i1427: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1377 + %_M_p.i21.i.i1424 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_12_b_path, i64 0, i32 0, i32 0 + store i8* %439, i8** %_M_p.i21.i.i1424, align 8, !tbaa !62, !alias.scope !155 + %_M_allocated_capacity.i.i1425 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1418, i64 0, i32 2, i32 0 + %441 = load i64, i64* %_M_allocated_capacity.i.i1425, align 8, !tbaa !15 + %_M_allocated_capacity.i.i.i1426 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_12_b_path, i64 0, i32 2, i32 0 + store i64 %441, i64* %_M_allocated_capacity.i.i.i1426, align 8, !tbaa !15, !alias.scope !155 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1430 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1430: ; preds = %if.then.i.i1423, %if.else.i.i1427 + %_M_string_length.i20.i.i1428 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1418, i64 0, i32 1 + %442 = load i64, i64* %_M_string_length.i20.i.i1428, align 8, !tbaa !59 + %_M_string_length.i.i2.i1429 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_12_b_path, i64 0, i32 1 + store i64 %442, i64* %_M_string_length.i.i2.i1429, align 8, !tbaa !59, !alias.scope !155 + %443 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1418 to %union.anon** + store %union.anon* %440, %union.anon** %443, align 8, !tbaa !62 + store i64 0, i64* %_M_string_length.i20.i.i1428, align 8, !tbaa !59 + store i8 0, i8* %arraydecay.i.i.i.i1420, align 1, !tbaa !42 + %_M_p.i.i.i.i1431 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp96, i64 0, i32 0, i32 0 + %444 = load i8*, i8** %_M_p.i.i.i.i1431, align 8, !tbaa !62 + %cmp.i.i.i1433 = icmp eq i8* %444, %433 + br i1 %cmp.i.i.i1433, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1435, label %if.then.i.i1434 + +if.then.i.i1434: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1430 + call void @_ZdlPv(i8* %444) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1435 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1435: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1430, %if.then.i.i1434 + call void @llvm.lifetime.end(i64 32, i8* nonnull %430) #2 + %_M_p.i.i1436 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_12_b_path, i64 0, i32 0, i32 0 + %445 = load i8*, i8** %_M_p.i.i1436, align 8, !tbaa !62 + %call99 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %445, i32 0, i64 1, i64 512, i64 1, i64 1) %446 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_13_w_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %446) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %446) #2 %447 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp100 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %447) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %447) #2 %448 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp100, i64 0, i32 2 %449 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp100 to %union.anon** - store %union.anon* %448, %union.anon** %449, align 8, !tbaa !52 + store %union.anon* %448, %union.anon** %449, align 8, !tbaa !58 %450 = bitcast %union.anon* %448 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %450, i8* nonnull getelementptr inbounds ([16 x i8], [16 x i8]* @.str.50, i64 0, i64 0), i64 15, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i1457 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp100, i64 0, i32 1 - store i64 15, i64* %_M_string_length.i.i.i.i.i.i1457, align 8, !tbaa !53 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %450, i8* nonnull getelementptr inbounds ([16 x i8], [16 x i8]* @.str.68, i64 0, i64 0), i64 15, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i1466 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp100, i64 0, i32 1 + store i64 15, i64* %_M_string_length.i.i.i.i.i.i1466, align 8, !tbaa !59 %451 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp100, i64 0, i32 2, i32 1, i64 7 - store i8 0, i8* %451, align 1, !tbaa !36 - %452 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !53, !noalias !140 - %453 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !56, !noalias !140 - %call3.i.i.i1462 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp100, i64 0, i64 0, i8* %453, i64 %452) #7, !noalias !140 + store i8 0, i8* %451, align 1, !tbaa !42 + %452 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !59, !noalias !158 + %453 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !62, !noalias !158 + %call3.i.i.i1471 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp100, i64 0, i64 0, i8* %453, i64 %452) #2, !noalias !158 %454 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_13_w_path, i64 0, i32 2 %455 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_13_w_path to %union.anon** - store %union.anon* %454, %union.anon** %455, align 8, !tbaa !52, !alias.scope !140 - %_M_p.i.i23.i.i1463 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1462, i64 0, i32 0, i32 0 - %456 = load i8*, i8** %_M_p.i.i23.i.i1463, align 8, !tbaa !56 - %457 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1462, i64 0, i32 2 - %arraydecay.i.i.i.i1464 = bitcast %union.anon* %457 to i8* - %cmp.i.i.i1465 = icmp eq i8* %456, %arraydecay.i.i.i.i1464 - br i1 %cmp.i.i.i1465, label %if.then.i.i1467, label %if.else.i.i1471 - -if.then.i.i1467: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1442 - %arraydecay.i.i.i1466 = bitcast %union.anon* %454 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1466, i8* %456, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1474 - -if.else.i.i1471: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1442 - %_M_p.i21.i.i1468 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_13_w_path, i64 0, i32 0, i32 0 - store i8* %456, i8** %_M_p.i21.i.i1468, align 8, !tbaa !56, !alias.scope !140 - %_M_allocated_capacity.i.i1469 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1462, i64 0, i32 2, i32 0 - %458 = load i64, i64* %_M_allocated_capacity.i.i1469, align 8, !tbaa !12 - %_M_allocated_capacity.i.i.i1470 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_13_w_path, i64 0, i32 2, i32 0 - store i64 %458, i64* %_M_allocated_capacity.i.i.i1470, align 8, !tbaa !12, !alias.scope !140 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1474 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1474: ; preds = %if.then.i.i1467, %if.else.i.i1471 - %_M_string_length.i20.i.i1472 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1462, i64 0, i32 1 - %459 = load i64, i64* %_M_string_length.i20.i.i1472, align 8, !tbaa !53 - %_M_string_length.i.i2.i1473 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_13_w_path, i64 0, i32 1 - store i64 %459, i64* %_M_string_length.i.i2.i1473, align 8, !tbaa !53, !alias.scope !140 - %460 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1462 to %union.anon** - store %union.anon* %457, %union.anon** %460, align 8, !tbaa !56 - store i64 0, i64* %_M_string_length.i20.i.i1472, align 8, !tbaa !53 - store i8 0, i8* %arraydecay.i.i.i.i1464, align 1, !tbaa !36 - %_M_p.i.i.i.i1475 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp100, i64 0, i32 0, i32 0 - %461 = load i8*, i8** %_M_p.i.i.i.i1475, align 8, !tbaa !56 - %cmp.i.i.i1477 = icmp eq i8* %461, %450 - br i1 %cmp.i.i.i1477, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1479, label %if.then.i.i1478 - -if.then.i.i1478: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1474 - call void @_ZdlPv(i8* %461) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1479 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1479: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1474, %if.then.i.i1478 - call void @llvm.lifetime.end(i64 32, i8* nonnull %447) #7 - %_M_p.i.i1480 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_13_w_path, i64 0, i32 0, i32 0 - %462 = load i8*, i8** %_M_p.i.i1480, align 8, !tbaa !56 - %call103 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %462, i32 0, i32 512, i32 512, i32 3, i32 3) + store %union.anon* %454, %union.anon** %455, align 8, !tbaa !58, !alias.scope !158 + %_M_p.i.i23.i.i1472 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1471, i64 0, i32 0, i32 0 + %456 = load i8*, i8** %_M_p.i.i23.i.i1472, align 8, !tbaa !62 + %457 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1471, i64 0, i32 2 + %arraydecay.i.i.i.i1473 = bitcast %union.anon* %457 to i8* + %cmp.i.i.i1474 = icmp eq i8* %456, %arraydecay.i.i.i.i1473 + br i1 %cmp.i.i.i1474, label %if.then.i.i1476, label %if.else.i.i1480 + +if.then.i.i1476: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1435 + %arraydecay.i.i.i1475 = bitcast %union.anon* %454 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1475, i8* %456, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1483 + +if.else.i.i1480: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1435 + %_M_p.i21.i.i1477 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_13_w_path, i64 0, i32 0, i32 0 + store i8* %456, i8** %_M_p.i21.i.i1477, align 8, !tbaa !62, !alias.scope !158 + %_M_allocated_capacity.i.i1478 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1471, i64 0, i32 2, i32 0 + %458 = load i64, i64* %_M_allocated_capacity.i.i1478, align 8, !tbaa !15 + %_M_allocated_capacity.i.i.i1479 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_13_w_path, i64 0, i32 2, i32 0 + store i64 %458, i64* %_M_allocated_capacity.i.i.i1479, align 8, !tbaa !15, !alias.scope !158 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1483 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1483: ; preds = %if.then.i.i1476, %if.else.i.i1480 + %_M_string_length.i20.i.i1481 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1471, i64 0, i32 1 + %459 = load i64, i64* %_M_string_length.i20.i.i1481, align 8, !tbaa !59 + %_M_string_length.i.i2.i1482 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_13_w_path, i64 0, i32 1 + store i64 %459, i64* %_M_string_length.i.i2.i1482, align 8, !tbaa !59, !alias.scope !158 + %460 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1471 to %union.anon** + store %union.anon* %457, %union.anon** %460, align 8, !tbaa !62 + store i64 0, i64* %_M_string_length.i20.i.i1481, align 8, !tbaa !59 + store i8 0, i8* %arraydecay.i.i.i.i1473, align 1, !tbaa !42 + %_M_p.i.i.i.i1484 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp100, i64 0, i32 0, i32 0 + %461 = load i8*, i8** %_M_p.i.i.i.i1484, align 8, !tbaa !62 + %cmp.i.i.i1486 = icmp eq i8* %461, %450 + br i1 %cmp.i.i.i1486, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1488, label %if.then.i.i1487 + +if.then.i.i1487: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1483 + call void @_ZdlPv(i8* %461) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1488 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1488: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1483, %if.then.i.i1487 + call void @llvm.lifetime.end(i64 32, i8* nonnull %447) #2 + %_M_p.i.i1489 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_13_w_path, i64 0, i32 0, i32 0 + %462 = load i8*, i8** %_M_p.i.i1489, align 8, !tbaa !62 + %call103 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %462, i32 0, i64 512, i64 512, i64 3, i64 3) %463 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_13_b_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %463) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %463) #2 %464 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp104 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %464) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %464) #2 %465 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp104, i64 0, i32 2 %466 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp104 to %union.anon** - store %union.anon* %465, %union.anon** %466, align 8, !tbaa !52 + store %union.anon* %465, %union.anon** %466, align 8, !tbaa !58 %467 = bitcast %union.anon* %465 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %467, i8* nonnull getelementptr inbounds ([16 x i8], [16 x i8]* @.str.51, i64 0, i64 0), i64 15, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i1515 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp104, i64 0, i32 1 - store i64 15, i64* %_M_string_length.i.i.i.i.i.i1515, align 8, !tbaa !53 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %467, i8* nonnull getelementptr inbounds ([16 x i8], [16 x i8]* @.str.69, i64 0, i64 0), i64 15, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i1509 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp104, i64 0, i32 1 + store i64 15, i64* %_M_string_length.i.i.i.i.i.i1509, align 8, !tbaa !59 %468 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp104, i64 0, i32 2, i32 1, i64 7 - store i8 0, i8* %468, align 1, !tbaa !36 - %469 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !53, !noalias !143 - %470 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !56, !noalias !143 - %call3.i.i.i1520 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp104, i64 0, i64 0, i8* %470, i64 %469) #7, !noalias !143 + store i8 0, i8* %468, align 1, !tbaa !42 + %469 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !59, !noalias !161 + %470 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !62, !noalias !161 + %call3.i.i.i1514 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp104, i64 0, i64 0, i8* %470, i64 %469) #2, !noalias !161 %471 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_13_b_path, i64 0, i32 2 %472 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_13_b_path to %union.anon** - store %union.anon* %471, %union.anon** %472, align 8, !tbaa !52, !alias.scope !143 - %_M_p.i.i23.i.i1521 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1520, i64 0, i32 0, i32 0 - %473 = load i8*, i8** %_M_p.i.i23.i.i1521, align 8, !tbaa !56 - %474 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1520, i64 0, i32 2 - %arraydecay.i.i.i.i1522 = bitcast %union.anon* %474 to i8* - %cmp.i.i.i1523 = icmp eq i8* %473, %arraydecay.i.i.i.i1522 - br i1 %cmp.i.i.i1523, label %if.then.i.i1525, label %if.else.i.i1529 - -if.then.i.i1525: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1479 - %arraydecay.i.i.i1524 = bitcast %union.anon* %471 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1524, i8* %473, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1532 - -if.else.i.i1529: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1479 - %_M_p.i21.i.i1526 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_13_b_path, i64 0, i32 0, i32 0 - store i8* %473, i8** %_M_p.i21.i.i1526, align 8, !tbaa !56, !alias.scope !143 - %_M_allocated_capacity.i.i1527 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1520, i64 0, i32 2, i32 0 - %475 = load i64, i64* %_M_allocated_capacity.i.i1527, align 8, !tbaa !12 - %_M_allocated_capacity.i.i.i1528 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_13_b_path, i64 0, i32 2, i32 0 - store i64 %475, i64* %_M_allocated_capacity.i.i.i1528, align 8, !tbaa !12, !alias.scope !143 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1532 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1532: ; preds = %if.then.i.i1525, %if.else.i.i1529 - %_M_string_length.i20.i.i1530 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1520, i64 0, i32 1 - %476 = load i64, i64* %_M_string_length.i20.i.i1530, align 8, !tbaa !53 - %_M_string_length.i.i2.i1531 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_13_b_path, i64 0, i32 1 - store i64 %476, i64* %_M_string_length.i.i2.i1531, align 8, !tbaa !53, !alias.scope !143 - %477 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1520 to %union.anon** - store %union.anon* %474, %union.anon** %477, align 8, !tbaa !56 - store i64 0, i64* %_M_string_length.i20.i.i1530, align 8, !tbaa !53 - store i8 0, i8* %arraydecay.i.i.i.i1522, align 1, !tbaa !36 - %_M_p.i.i.i.i1533 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp104, i64 0, i32 0, i32 0 - %478 = load i8*, i8** %_M_p.i.i.i.i1533, align 8, !tbaa !56 - %cmp.i.i.i1535 = icmp eq i8* %478, %467 - br i1 %cmp.i.i.i1535, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1537, label %if.then.i.i1536 - -if.then.i.i1536: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1532 - call void @_ZdlPv(i8* %478) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1537 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1537: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1532, %if.then.i.i1536 - call void @llvm.lifetime.end(i64 32, i8* nonnull %464) #7 - %_M_p.i.i1538 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_13_b_path, i64 0, i32 0, i32 0 - %479 = load i8*, i8** %_M_p.i.i1538, align 8, !tbaa !56 - %call107 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %479, i32 0, i32 1, i32 512, i32 1, i32 1) + store %union.anon* %471, %union.anon** %472, align 8, !tbaa !58, !alias.scope !161 + %_M_p.i.i23.i.i1515 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1514, i64 0, i32 0, i32 0 + %473 = load i8*, i8** %_M_p.i.i23.i.i1515, align 8, !tbaa !62 + %474 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1514, i64 0, i32 2 + %arraydecay.i.i.i.i1516 = bitcast %union.anon* %474 to i8* + %cmp.i.i.i1517 = icmp eq i8* %473, %arraydecay.i.i.i.i1516 + br i1 %cmp.i.i.i1517, label %if.then.i.i1519, label %if.else.i.i1523 + +if.then.i.i1519: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1488 + %arraydecay.i.i.i1518 = bitcast %union.anon* %471 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1518, i8* %473, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1526 + +if.else.i.i1523: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1488 + %_M_p.i21.i.i1520 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_13_b_path, i64 0, i32 0, i32 0 + store i8* %473, i8** %_M_p.i21.i.i1520, align 8, !tbaa !62, !alias.scope !161 + %_M_allocated_capacity.i.i1521 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1514, i64 0, i32 2, i32 0 + %475 = load i64, i64* %_M_allocated_capacity.i.i1521, align 8, !tbaa !15 + %_M_allocated_capacity.i.i.i1522 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_13_b_path, i64 0, i32 2, i32 0 + store i64 %475, i64* %_M_allocated_capacity.i.i.i1522, align 8, !tbaa !15, !alias.scope !161 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1526 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1526: ; preds = %if.then.i.i1519, %if.else.i.i1523 + %_M_string_length.i20.i.i1524 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1514, i64 0, i32 1 + %476 = load i64, i64* %_M_string_length.i20.i.i1524, align 8, !tbaa !59 + %_M_string_length.i.i2.i1525 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_13_b_path, i64 0, i32 1 + store i64 %476, i64* %_M_string_length.i.i2.i1525, align 8, !tbaa !59, !alias.scope !161 + %477 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1514 to %union.anon** + store %union.anon* %474, %union.anon** %477, align 8, !tbaa !62 + store i64 0, i64* %_M_string_length.i20.i.i1524, align 8, !tbaa !59 + store i8 0, i8* %arraydecay.i.i.i.i1516, align 1, !tbaa !42 + %_M_p.i.i.i.i1527 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp104, i64 0, i32 0, i32 0 + %478 = load i8*, i8** %_M_p.i.i.i.i1527, align 8, !tbaa !62 + %cmp.i.i.i1529 = icmp eq i8* %478, %467 + br i1 %cmp.i.i.i1529, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1531, label %if.then.i.i1530 + +if.then.i.i1530: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1526 + call void @_ZdlPv(i8* %478) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1531 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1531: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1526, %if.then.i.i1530 + call void @llvm.lifetime.end(i64 32, i8* nonnull %464) #2 + %_M_p.i.i1532 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_13_b_path, i64 0, i32 0, i32 0 + %479 = load i8*, i8** %_M_p.i.i1532, align 8, !tbaa !62 + %call107 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %479, i32 0, i64 1, i64 512, i64 1, i64 1) %480 = bitcast %"class.std::__cxx11::basic_string"* %dense_1_w_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %480) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %480) #2 %481 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp108 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %481) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %481) #2 %482 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp108, i64 0, i32 2 %483 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp108 to %union.anon** - store %union.anon* %482, %union.anon** %483, align 8, !tbaa !52 + store %union.anon* %482, %union.anon** %483, align 8, !tbaa !58 %484 = bitcast %union.anon* %482 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %484, i8* nonnull getelementptr inbounds ([14 x i8], [14 x i8]* @.str.52, i64 0, i64 0), i64 13, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i1568 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp108, i64 0, i32 1 - store i64 13, i64* %_M_string_length.i.i.i.i.i.i1568, align 8, !tbaa !53 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %484, i8* nonnull getelementptr inbounds ([14 x i8], [14 x i8]* @.str.70, i64 0, i64 0), i64 13, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i1577 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp108, i64 0, i32 1 + store i64 13, i64* %_M_string_length.i.i.i.i.i.i1577, align 8, !tbaa !59 %485 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp108, i64 0, i32 2, i32 1, i64 5 - store i8 0, i8* %485, align 1, !tbaa !36 - %486 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !53, !noalias !146 - %487 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !56, !noalias !146 - %call3.i.i.i1573 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp108, i64 0, i64 0, i8* %487, i64 %486) #7, !noalias !146 + store i8 0, i8* %485, align 1, !tbaa !42 + %486 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !59, !noalias !164 + %487 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !62, !noalias !164 + %call3.i.i.i1582 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp108, i64 0, i64 0, i8* %487, i64 %486) #2, !noalias !164 %488 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_1_w_path, i64 0, i32 2 %489 = bitcast %"class.std::__cxx11::basic_string"* %dense_1_w_path to %union.anon** - store %union.anon* %488, %union.anon** %489, align 8, !tbaa !52, !alias.scope !146 - %_M_p.i.i23.i.i1574 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1573, i64 0, i32 0, i32 0 - %490 = load i8*, i8** %_M_p.i.i23.i.i1574, align 8, !tbaa !56 - %491 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1573, i64 0, i32 2 - %arraydecay.i.i.i.i1575 = bitcast %union.anon* %491 to i8* - %cmp.i.i.i1576 = icmp eq i8* %490, %arraydecay.i.i.i.i1575 - br i1 %cmp.i.i.i1576, label %if.then.i.i1578, label %if.else.i.i1582 - -if.then.i.i1578: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1537 - %arraydecay.i.i.i1577 = bitcast %union.anon* %488 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1577, i8* %490, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1585 - -if.else.i.i1582: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1537 - %_M_p.i21.i.i1579 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_1_w_path, i64 0, i32 0, i32 0 - store i8* %490, i8** %_M_p.i21.i.i1579, align 8, !tbaa !56, !alias.scope !146 - %_M_allocated_capacity.i.i1580 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1573, i64 0, i32 2, i32 0 - %492 = load i64, i64* %_M_allocated_capacity.i.i1580, align 8, !tbaa !12 - %_M_allocated_capacity.i.i.i1581 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_1_w_path, i64 0, i32 2, i32 0 - store i64 %492, i64* %_M_allocated_capacity.i.i.i1581, align 8, !tbaa !12, !alias.scope !146 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1585 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1585: ; preds = %if.then.i.i1578, %if.else.i.i1582 - %_M_string_length.i20.i.i1583 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1573, i64 0, i32 1 - %493 = load i64, i64* %_M_string_length.i20.i.i1583, align 8, !tbaa !53 - %_M_string_length.i.i2.i1584 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_1_w_path, i64 0, i32 1 - store i64 %493, i64* %_M_string_length.i.i2.i1584, align 8, !tbaa !53, !alias.scope !146 - %494 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1573 to %union.anon** - store %union.anon* %491, %union.anon** %494, align 8, !tbaa !56 - store i64 0, i64* %_M_string_length.i20.i.i1583, align 8, !tbaa !53 - store i8 0, i8* %arraydecay.i.i.i.i1575, align 1, !tbaa !36 - %_M_p.i.i.i.i1586 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp108, i64 0, i32 0, i32 0 - %495 = load i8*, i8** %_M_p.i.i.i.i1586, align 8, !tbaa !56 - %cmp.i.i.i1588 = icmp eq i8* %495, %484 - br i1 %cmp.i.i.i1588, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1590, label %if.then.i.i1589 - -if.then.i.i1589: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1585 - call void @_ZdlPv(i8* %495) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1590 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1590: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1585, %if.then.i.i1589 - call void @llvm.lifetime.end(i64 32, i8* nonnull %481) #7 - %_M_p.i.i1591 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_1_w_path, i64 0, i32 0, i32 0 - %496 = load i8*, i8** %_M_p.i.i1591, align 8, !tbaa !56 - %call111 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %496, i32 0, i32 1, i32 1, i32 512, i32 512) + store %union.anon* %488, %union.anon** %489, align 8, !tbaa !58, !alias.scope !164 + %_M_p.i.i23.i.i1583 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1582, i64 0, i32 0, i32 0 + %490 = load i8*, i8** %_M_p.i.i23.i.i1583, align 8, !tbaa !62 + %491 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1582, i64 0, i32 2 + %arraydecay.i.i.i.i1584 = bitcast %union.anon* %491 to i8* + %cmp.i.i.i1585 = icmp eq i8* %490, %arraydecay.i.i.i.i1584 + br i1 %cmp.i.i.i1585, label %if.then.i.i1587, label %if.else.i.i1591 + +if.then.i.i1587: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1531 + %arraydecay.i.i.i1586 = bitcast %union.anon* %488 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1586, i8* %490, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1594 + +if.else.i.i1591: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1531 + %_M_p.i21.i.i1588 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_1_w_path, i64 0, i32 0, i32 0 + store i8* %490, i8** %_M_p.i21.i.i1588, align 8, !tbaa !62, !alias.scope !164 + %_M_allocated_capacity.i.i1589 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1582, i64 0, i32 2, i32 0 + %492 = load i64, i64* %_M_allocated_capacity.i.i1589, align 8, !tbaa !15 + %_M_allocated_capacity.i.i.i1590 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_1_w_path, i64 0, i32 2, i32 0 + store i64 %492, i64* %_M_allocated_capacity.i.i.i1590, align 8, !tbaa !15, !alias.scope !164 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1594 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1594: ; preds = %if.then.i.i1587, %if.else.i.i1591 + %_M_string_length.i20.i.i1592 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1582, i64 0, i32 1 + %493 = load i64, i64* %_M_string_length.i20.i.i1592, align 8, !tbaa !59 + %_M_string_length.i.i2.i1593 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_1_w_path, i64 0, i32 1 + store i64 %493, i64* %_M_string_length.i.i2.i1593, align 8, !tbaa !59, !alias.scope !164 + %494 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1582 to %union.anon** + store %union.anon* %491, %union.anon** %494, align 8, !tbaa !62 + store i64 0, i64* %_M_string_length.i20.i.i1592, align 8, !tbaa !59 + store i8 0, i8* %arraydecay.i.i.i.i1584, align 1, !tbaa !42 + %_M_p.i.i.i.i1595 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp108, i64 0, i32 0, i32 0 + %495 = load i8*, i8** %_M_p.i.i.i.i1595, align 8, !tbaa !62 + %cmp.i.i.i1597 = icmp eq i8* %495, %484 + br i1 %cmp.i.i.i1597, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1599, label %if.then.i.i1598 + +if.then.i.i1598: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1594 + call void @_ZdlPv(i8* %495) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1599 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1599: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1594, %if.then.i.i1598 + call void @llvm.lifetime.end(i64 32, i8* nonnull %481) #2 + %_M_p.i.i1600 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_1_w_path, i64 0, i32 0, i32 0 + %496 = load i8*, i8** %_M_p.i.i1600, align 8, !tbaa !62 + %call111 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %496, i32 0, i64 1, i64 1, i64 512, i64 512) %497 = bitcast %"class.std::__cxx11::basic_string"* %dense_1_b_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %497) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %497) #2 %498 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp112 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %498) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %498) #2 %499 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp112, i64 0, i32 2 %500 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp112 to %union.anon** - store %union.anon* %499, %union.anon** %500, align 8, !tbaa !52 + store %union.anon* %499, %union.anon** %500, align 8, !tbaa !58 %501 = bitcast %union.anon* %499 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %501, i8* nonnull getelementptr inbounds ([14 x i8], [14 x i8]* @.str.53, i64 0, i64 0), i64 13, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i1552 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp112, i64 0, i32 1 - store i64 13, i64* %_M_string_length.i.i.i.i.i.i1552, align 8, !tbaa !53 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %501, i8* nonnull getelementptr inbounds ([14 x i8], [14 x i8]* @.str.71, i64 0, i64 0), i64 13, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i1561 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp112, i64 0, i32 1 + store i64 13, i64* %_M_string_length.i.i.i.i.i.i1561, align 8, !tbaa !59 %502 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp112, i64 0, i32 2, i32 1, i64 5 - store i8 0, i8* %502, align 1, !tbaa !36 - %503 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !53, !noalias !149 - %504 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !56, !noalias !149 - %call3.i.i.i1489 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp112, i64 0, i64 0, i8* %504, i64 %503) #7, !noalias !149 + store i8 0, i8* %502, align 1, !tbaa !42 + %503 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !59, !noalias !167 + %504 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !62, !noalias !167 + %call3.i.i.i1535 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp112, i64 0, i64 0, i8* %504, i64 %503) #2, !noalias !167 %505 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_1_b_path, i64 0, i32 2 %506 = bitcast %"class.std::__cxx11::basic_string"* %dense_1_b_path to %union.anon** - store %union.anon* %505, %union.anon** %506, align 8, !tbaa !52, !alias.scope !149 - %_M_p.i.i23.i.i1490 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1489, i64 0, i32 0, i32 0 - %507 = load i8*, i8** %_M_p.i.i23.i.i1490, align 8, !tbaa !56 - %508 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1489, i64 0, i32 2 - %arraydecay.i.i.i.i1491 = bitcast %union.anon* %508 to i8* - %cmp.i.i.i1492 = icmp eq i8* %507, %arraydecay.i.i.i.i1491 - br i1 %cmp.i.i.i1492, label %if.then.i.i1494, label %if.else.i.i1498 - -if.then.i.i1494: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1590 - %arraydecay.i.i.i1493 = bitcast %union.anon* %505 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1493, i8* %507, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1501 - -if.else.i.i1498: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1590 - %_M_p.i21.i.i1495 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_1_b_path, i64 0, i32 0, i32 0 - store i8* %507, i8** %_M_p.i21.i.i1495, align 8, !tbaa !56, !alias.scope !149 - %_M_allocated_capacity.i.i1496 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1489, i64 0, i32 2, i32 0 - %509 = load i64, i64* %_M_allocated_capacity.i.i1496, align 8, !tbaa !12 - %_M_allocated_capacity.i.i.i1497 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_1_b_path, i64 0, i32 2, i32 0 - store i64 %509, i64* %_M_allocated_capacity.i.i.i1497, align 8, !tbaa !12, !alias.scope !149 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1501 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1501: ; preds = %if.then.i.i1494, %if.else.i.i1498 - %_M_string_length.i20.i.i1499 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1489, i64 0, i32 1 - %510 = load i64, i64* %_M_string_length.i20.i.i1499, align 8, !tbaa !53 - %_M_string_length.i.i2.i1500 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_1_b_path, i64 0, i32 1 - store i64 %510, i64* %_M_string_length.i.i2.i1500, align 8, !tbaa !53, !alias.scope !149 - %511 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1489 to %union.anon** - store %union.anon* %508, %union.anon** %511, align 8, !tbaa !56 - store i64 0, i64* %_M_string_length.i20.i.i1499, align 8, !tbaa !53 - store i8 0, i8* %arraydecay.i.i.i.i1491, align 1, !tbaa !36 - %_M_p.i.i.i.i1482 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp112, i64 0, i32 0, i32 0 - %512 = load i8*, i8** %_M_p.i.i.i.i1482, align 8, !tbaa !56 - %cmp.i.i.i1484 = icmp eq i8* %512, %501 - br i1 %cmp.i.i.i1484, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1486, label %if.then.i.i1485 - -if.then.i.i1485: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1501 - call void @_ZdlPv(i8* %512) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1486 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1486: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1501, %if.then.i.i1485 - call void @llvm.lifetime.end(i64 32, i8* nonnull %498) #7 - %_M_p.i.i1481 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_1_b_path, i64 0, i32 0, i32 0 - %513 = load i8*, i8** %_M_p.i.i1481, align 8, !tbaa !56 - %call115 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %513, i32 0, i32 1, i32 512, i32 1, i32 1) + store %union.anon* %505, %union.anon** %506, align 8, !tbaa !58, !alias.scope !167 + %_M_p.i.i23.i.i1536 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1535, i64 0, i32 0, i32 0 + %507 = load i8*, i8** %_M_p.i.i23.i.i1536, align 8, !tbaa !62 + %508 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1535, i64 0, i32 2 + %arraydecay.i.i.i.i1537 = bitcast %union.anon* %508 to i8* + %cmp.i.i.i1538 = icmp eq i8* %507, %arraydecay.i.i.i.i1537 + br i1 %cmp.i.i.i1538, label %if.then.i.i1540, label %if.else.i.i1544 + +if.then.i.i1540: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1599 + %arraydecay.i.i.i1539 = bitcast %union.anon* %505 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1539, i8* %507, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1547 + +if.else.i.i1544: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1599 + %_M_p.i21.i.i1541 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_1_b_path, i64 0, i32 0, i32 0 + store i8* %507, i8** %_M_p.i21.i.i1541, align 8, !tbaa !62, !alias.scope !167 + %_M_allocated_capacity.i.i1542 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1535, i64 0, i32 2, i32 0 + %509 = load i64, i64* %_M_allocated_capacity.i.i1542, align 8, !tbaa !15 + %_M_allocated_capacity.i.i.i1543 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_1_b_path, i64 0, i32 2, i32 0 + store i64 %509, i64* %_M_allocated_capacity.i.i.i1543, align 8, !tbaa !15, !alias.scope !167 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1547 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1547: ; preds = %if.then.i.i1540, %if.else.i.i1544 + %_M_string_length.i20.i.i1545 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1535, i64 0, i32 1 + %510 = load i64, i64* %_M_string_length.i20.i.i1545, align 8, !tbaa !59 + %_M_string_length.i.i2.i1546 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_1_b_path, i64 0, i32 1 + store i64 %510, i64* %_M_string_length.i.i2.i1546, align 8, !tbaa !59, !alias.scope !167 + %511 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1535 to %union.anon** + store %union.anon* %508, %union.anon** %511, align 8, !tbaa !62 + store i64 0, i64* %_M_string_length.i20.i.i1545, align 8, !tbaa !59 + store i8 0, i8* %arraydecay.i.i.i.i1537, align 1, !tbaa !42 + %_M_p.i.i.i.i1491 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp112, i64 0, i32 0, i32 0 + %512 = load i8*, i8** %_M_p.i.i.i.i1491, align 8, !tbaa !62 + %cmp.i.i.i1493 = icmp eq i8* %512, %501 + br i1 %cmp.i.i.i1493, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1495, label %if.then.i.i1494 + +if.then.i.i1494: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1547 + call void @_ZdlPv(i8* %512) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1495 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1495: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1547, %if.then.i.i1494 + call void @llvm.lifetime.end(i64 32, i8* nonnull %498) #2 + %_M_p.i.i1490 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_1_b_path, i64 0, i32 0, i32 0 + %513 = load i8*, i8** %_M_p.i.i1490, align 8, !tbaa !62 + %call115 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %513, i32 0, i64 1, i64 512, i64 1, i64 1) %514 = bitcast %"class.std::__cxx11::basic_string"* %dense_2_w_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %514) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %514) #2 %515 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp116 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %515) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %515) #2 %516 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp116, i64 0, i32 2 %517 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp116 to %union.anon** - store %union.anon* %516, %union.anon** %517, align 8, !tbaa !52 + store %union.anon* %516, %union.anon** %517, align 8, !tbaa !58 %518 = bitcast %union.anon* %516 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %518, i8* nonnull getelementptr inbounds ([14 x i8], [14 x i8]* @.str.54, i64 0, i64 0), i64 13, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i1404 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp116, i64 0, i32 1 - store i64 13, i64* %_M_string_length.i.i.i.i.i.i1404, align 8, !tbaa !53 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %518, i8* nonnull getelementptr inbounds ([14 x i8], [14 x i8]* @.str.72, i64 0, i64 0), i64 13, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i1450 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp116, i64 0, i32 1 + store i64 13, i64* %_M_string_length.i.i.i.i.i.i1450, align 8, !tbaa !59 %519 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp116, i64 0, i32 2, i32 1, i64 5 - store i8 0, i8* %519, align 1, !tbaa !36 - %520 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !53, !noalias !152 - %521 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !56, !noalias !152 - %call3.i.i.i1378 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp116, i64 0, i64 0, i8* %521, i64 %520) #7, !noalias !152 + store i8 0, i8* %519, align 1, !tbaa !42 + %520 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !59, !noalias !170 + %521 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !62, !noalias !170 + %call3.i.i.i1387 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp116, i64 0, i64 0, i8* %521, i64 %520) #2, !noalias !170 %522 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_2_w_path, i64 0, i32 2 %523 = bitcast %"class.std::__cxx11::basic_string"* %dense_2_w_path to %union.anon** - store %union.anon* %522, %union.anon** %523, align 8, !tbaa !52, !alias.scope !152 - %_M_p.i.i23.i.i1379 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1378, i64 0, i32 0, i32 0 - %524 = load i8*, i8** %_M_p.i.i23.i.i1379, align 8, !tbaa !56 - %525 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1378, i64 0, i32 2 - %arraydecay.i.i.i.i1380 = bitcast %union.anon* %525 to i8* - %cmp.i.i.i1381 = icmp eq i8* %524, %arraydecay.i.i.i.i1380 - br i1 %cmp.i.i.i1381, label %if.then.i.i1383, label %if.else.i.i1387 - -if.then.i.i1383: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1486 - %arraydecay.i.i.i1382 = bitcast %union.anon* %522 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1382, i8* %524, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1390 - -if.else.i.i1387: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1486 - %_M_p.i21.i.i1384 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_2_w_path, i64 0, i32 0, i32 0 - store i8* %524, i8** %_M_p.i21.i.i1384, align 8, !tbaa !56, !alias.scope !152 - %_M_allocated_capacity.i.i1385 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1378, i64 0, i32 2, i32 0 - %526 = load i64, i64* %_M_allocated_capacity.i.i1385, align 8, !tbaa !12 - %_M_allocated_capacity.i.i.i1386 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_2_w_path, i64 0, i32 2, i32 0 - store i64 %526, i64* %_M_allocated_capacity.i.i.i1386, align 8, !tbaa !12, !alias.scope !152 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1390 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1390: ; preds = %if.then.i.i1383, %if.else.i.i1387 - %_M_string_length.i20.i.i1388 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1378, i64 0, i32 1 - %527 = load i64, i64* %_M_string_length.i20.i.i1388, align 8, !tbaa !53 - %_M_string_length.i.i2.i1389 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_2_w_path, i64 0, i32 1 - store i64 %527, i64* %_M_string_length.i.i2.i1389, align 8, !tbaa !53, !alias.scope !152 - %528 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1378 to %union.anon** - store %union.anon* %525, %union.anon** %528, align 8, !tbaa !56 - store i64 0, i64* %_M_string_length.i20.i.i1388, align 8, !tbaa !53 - store i8 0, i8* %arraydecay.i.i.i.i1380, align 1, !tbaa !36 - %_M_p.i.i.i.i1334 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp116, i64 0, i32 0, i32 0 - %529 = load i8*, i8** %_M_p.i.i.i.i1334, align 8, !tbaa !56 - %cmp.i.i.i1336 = icmp eq i8* %529, %518 - br i1 %cmp.i.i.i1336, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1338, label %if.then.i.i1337 - -if.then.i.i1337: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1390 - call void @_ZdlPv(i8* %529) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1338 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1338: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1390, %if.then.i.i1337 - call void @llvm.lifetime.end(i64 32, i8* nonnull %515) #7 - %_M_p.i.i1333 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_2_w_path, i64 0, i32 0, i32 0 - %530 = load i8*, i8** %_M_p.i.i1333, align 8, !tbaa !56 - %call119 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %530, i32 0, i32 1, i32 1, i32 512, i32 10) + store %union.anon* %522, %union.anon** %523, align 8, !tbaa !58, !alias.scope !170 + %_M_p.i.i23.i.i1388 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1387, i64 0, i32 0, i32 0 + %524 = load i8*, i8** %_M_p.i.i23.i.i1388, align 8, !tbaa !62 + %525 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1387, i64 0, i32 2 + %arraydecay.i.i.i.i1389 = bitcast %union.anon* %525 to i8* + %cmp.i.i.i1390 = icmp eq i8* %524, %arraydecay.i.i.i.i1389 + br i1 %cmp.i.i.i1390, label %if.then.i.i1392, label %if.else.i.i1396 + +if.then.i.i1392: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1495 + %arraydecay.i.i.i1391 = bitcast %union.anon* %522 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1391, i8* %524, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1399 + +if.else.i.i1396: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1495 + %_M_p.i21.i.i1393 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_2_w_path, i64 0, i32 0, i32 0 + store i8* %524, i8** %_M_p.i21.i.i1393, align 8, !tbaa !62, !alias.scope !170 + %_M_allocated_capacity.i.i1394 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1387, i64 0, i32 2, i32 0 + %526 = load i64, i64* %_M_allocated_capacity.i.i1394, align 8, !tbaa !15 + %_M_allocated_capacity.i.i.i1395 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_2_w_path, i64 0, i32 2, i32 0 + store i64 %526, i64* %_M_allocated_capacity.i.i.i1395, align 8, !tbaa !15, !alias.scope !170 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1399 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1399: ; preds = %if.then.i.i1392, %if.else.i.i1396 + %_M_string_length.i20.i.i1397 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1387, i64 0, i32 1 + %527 = load i64, i64* %_M_string_length.i20.i.i1397, align 8, !tbaa !59 + %_M_string_length.i.i2.i1398 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_2_w_path, i64 0, i32 1 + store i64 %527, i64* %_M_string_length.i.i2.i1398, align 8, !tbaa !59, !alias.scope !170 + %528 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1387 to %union.anon** + store %union.anon* %525, %union.anon** %528, align 8, !tbaa !62 + store i64 0, i64* %_M_string_length.i20.i.i1397, align 8, !tbaa !59 + store i8 0, i8* %arraydecay.i.i.i.i1389, align 1, !tbaa !42 + %_M_p.i.i.i.i1380 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp116, i64 0, i32 0, i32 0 + %529 = load i8*, i8** %_M_p.i.i.i.i1380, align 8, !tbaa !62 + %cmp.i.i.i1382 = icmp eq i8* %529, %518 + br i1 %cmp.i.i.i1382, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1384, label %if.then.i.i1383 + +if.then.i.i1383: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1399 + call void @_ZdlPv(i8* %529) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1384 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1384: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1399, %if.then.i.i1383 + call void @llvm.lifetime.end(i64 32, i8* nonnull %515) #2 + %_M_p.i.i1379 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_2_w_path, i64 0, i32 0, i32 0 + %530 = load i8*, i8** %_M_p.i.i1379, align 8, !tbaa !62 + %call119 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %530, i32 0, i64 1, i64 1, i64 512, i64 10) %531 = bitcast %"class.std::__cxx11::basic_string"* %dense_2_b_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %531) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %531) #2 %532 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp120 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %532) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %532) #2 %533 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp120, i64 0, i32 2 %534 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp120 to %union.anon** - store %union.anon* %533, %union.anon** %534, align 8, !tbaa !52 + store %union.anon* %533, %union.anon** %534, align 8, !tbaa !58 %535 = bitcast %union.anon* %533 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %535, i8* nonnull getelementptr inbounds ([14 x i8], [14 x i8]* @.str.55, i64 0, i64 0), i64 13, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i1293 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp120, i64 0, i32 1 - store i64 13, i64* %_M_string_length.i.i.i.i.i.i1293, align 8, !tbaa !53 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %535, i8* nonnull getelementptr inbounds ([14 x i8], [14 x i8]* @.str.73, i64 0, i64 0), i64 13, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i1302 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp120, i64 0, i32 1 + store i64 13, i64* %_M_string_length.i.i.i.i.i.i1302, align 8, !tbaa !59 %536 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp120, i64 0, i32 2, i32 1, i64 5 - store i8 0, i8* %536, align 1, !tbaa !36 - %537 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !53, !noalias !155 - %538 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !56, !noalias !155 - %call3.i.i.i1230 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp120, i64 0, i64 0, i8* %538, i64 %537) #7, !noalias !155 + store i8 0, i8* %536, align 1, !tbaa !42 + %537 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !59, !noalias !173 + %538 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !62, !noalias !173 + %call3.i.i.i1276 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp120, i64 0, i64 0, i8* %538, i64 %537) #2, !noalias !173 %539 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_2_b_path, i64 0, i32 2 %540 = bitcast %"class.std::__cxx11::basic_string"* %dense_2_b_path to %union.anon** - store %union.anon* %539, %union.anon** %540, align 8, !tbaa !52, !alias.scope !155 - %_M_p.i.i23.i.i1231 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1230, i64 0, i32 0, i32 0 - %541 = load i8*, i8** %_M_p.i.i23.i.i1231, align 8, !tbaa !56 - %542 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1230, i64 0, i32 2 - %arraydecay.i.i.i.i1232 = bitcast %union.anon* %542 to i8* - %cmp.i.i.i1233 = icmp eq i8* %541, %arraydecay.i.i.i.i1232 - br i1 %cmp.i.i.i1233, label %if.then.i.i1235, label %if.else.i.i1239 - -if.then.i.i1235: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1338 - %arraydecay.i.i.i1234 = bitcast %union.anon* %539 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1234, i8* %541, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1242 - -if.else.i.i1239: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1338 - %_M_p.i21.i.i1236 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_2_b_path, i64 0, i32 0, i32 0 - store i8* %541, i8** %_M_p.i21.i.i1236, align 8, !tbaa !56, !alias.scope !155 - %_M_allocated_capacity.i.i1237 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1230, i64 0, i32 2, i32 0 - %543 = load i64, i64* %_M_allocated_capacity.i.i1237, align 8, !tbaa !12 - %_M_allocated_capacity.i.i.i1238 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_2_b_path, i64 0, i32 2, i32 0 - store i64 %543, i64* %_M_allocated_capacity.i.i.i1238, align 8, !tbaa !12, !alias.scope !155 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1242 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1242: ; preds = %if.then.i.i1235, %if.else.i.i1239 - %_M_string_length.i20.i.i1240 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1230, i64 0, i32 1 - %544 = load i64, i64* %_M_string_length.i20.i.i1240, align 8, !tbaa !53 - %_M_string_length.i.i2.i1241 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_2_b_path, i64 0, i32 1 - store i64 %544, i64* %_M_string_length.i.i2.i1241, align 8, !tbaa !53, !alias.scope !155 - %545 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1230 to %union.anon** - store %union.anon* %542, %union.anon** %545, align 8, !tbaa !56 - store i64 0, i64* %_M_string_length.i20.i.i1240, align 8, !tbaa !53 - store i8 0, i8* %arraydecay.i.i.i.i1232, align 1, !tbaa !36 - %_M_p.i.i.i.i1223 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp120, i64 0, i32 0, i32 0 - %546 = load i8*, i8** %_M_p.i.i.i.i1223, align 8, !tbaa !56 - %cmp.i.i.i1225 = icmp eq i8* %546, %535 - br i1 %cmp.i.i.i1225, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1227, label %if.then.i.i1226 - -if.then.i.i1226: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1242 - call void @_ZdlPv(i8* %546) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1227 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1227: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1242, %if.then.i.i1226 - call void @llvm.lifetime.end(i64 32, i8* nonnull %532) #7 - %_M_p.i.i1222 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_2_b_path, i64 0, i32 0, i32 0 - %547 = load i8*, i8** %_M_p.i.i1222, align 8, !tbaa !56 - %call123 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %547, i32 0, i32 1, i32 10, i32 1, i32 1) - %_M_p.i.i1184 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %input_path, i64 0, i32 0, i32 0 - %548 = load i8*, i8** %_M_p.i.i1184, align 8, !tbaa !56 - %call125 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %548, i32 0, i32 2000, i32 3, i32 32, i32 32) - %_M_p.i.i1183 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %labels_path, i64 0, i32 0, i32 0 - %549 = load i8*, i8** %_M_p.i.i1183, align 8, !tbaa !56 - %call.i = call noalias i8* @malloc(i64 2000) #7 - %call1.i = call %struct._IO_FILE* @fopen(i8* %549, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.15, i64 0, i64 0)) #7 + store %union.anon* %539, %union.anon** %540, align 8, !tbaa !58, !alias.scope !173 + %_M_p.i.i23.i.i1277 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1276, i64 0, i32 0, i32 0 + %541 = load i8*, i8** %_M_p.i.i23.i.i1277, align 8, !tbaa !62 + %542 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1276, i64 0, i32 2 + %arraydecay.i.i.i.i1278 = bitcast %union.anon* %542 to i8* + %cmp.i.i.i1279 = icmp eq i8* %541, %arraydecay.i.i.i.i1278 + br i1 %cmp.i.i.i1279, label %if.then.i.i1281, label %if.else.i.i1285 + +if.then.i.i1281: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1384 + %arraydecay.i.i.i1280 = bitcast %union.anon* %539 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1280, i8* %541, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1288 + +if.else.i.i1285: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1384 + %_M_p.i21.i.i1282 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_2_b_path, i64 0, i32 0, i32 0 + store i8* %541, i8** %_M_p.i21.i.i1282, align 8, !tbaa !62, !alias.scope !173 + %_M_allocated_capacity.i.i1283 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1276, i64 0, i32 2, i32 0 + %543 = load i64, i64* %_M_allocated_capacity.i.i1283, align 8, !tbaa !15 + %_M_allocated_capacity.i.i.i1284 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_2_b_path, i64 0, i32 2, i32 0 + store i64 %543, i64* %_M_allocated_capacity.i.i.i1284, align 8, !tbaa !15, !alias.scope !173 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1288 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1288: ; preds = %if.then.i.i1281, %if.else.i.i1285 + %_M_string_length.i20.i.i1286 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1276, i64 0, i32 1 + %544 = load i64, i64* %_M_string_length.i20.i.i1286, align 8, !tbaa !59 + %_M_string_length.i.i2.i1287 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_2_b_path, i64 0, i32 1 + store i64 %544, i64* %_M_string_length.i.i2.i1287, align 8, !tbaa !59, !alias.scope !173 + %545 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1276 to %union.anon** + store %union.anon* %542, %union.anon** %545, align 8, !tbaa !62 + store i64 0, i64* %_M_string_length.i20.i.i1286, align 8, !tbaa !59 + store i8 0, i8* %arraydecay.i.i.i.i1278, align 1, !tbaa !42 + %_M_p.i.i.i.i1232 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp120, i64 0, i32 0, i32 0 + %546 = load i8*, i8** %_M_p.i.i.i.i1232, align 8, !tbaa !62 + %cmp.i.i.i1234 = icmp eq i8* %546, %535 + br i1 %cmp.i.i.i1234, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1236, label %if.then.i.i1235 + +if.then.i.i1235: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1288 + call void @_ZdlPv(i8* %546) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1236 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1236: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1288, %if.then.i.i1235 + call void @llvm.lifetime.end(i64 32, i8* nonnull %532) #2 + %_M_p.i.i1231 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_2_b_path, i64 0, i32 0, i32 0 + %547 = load i8*, i8** %_M_p.i.i1231, align 8, !tbaa !62 + %call123 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %547, i32 0, i64 1, i64 10, i64 1, i64 1) + %_M_p.i.i1230 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %input_path, i64 0, i32 0, i32 0 + %548 = load i8*, i8** %_M_p.i.i1230, align 8, !tbaa !62 + %call125 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %548, i32 0, i64 2000, i64 3, i64 32, i64 32) + %_M_p.i.i1192 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %labels_path, i64 0, i32 0, i32 0 + %549 = load i8*, i8** %_M_p.i.i1192, align 8, !tbaa !62 + %call.i = call noalias i8* @malloc(i64 8000) #2 + %call1.i = call %struct._IO_FILE* @fopen(i8* %549, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.14, i64 0, i64 0)) #2 %cmp.i = icmp eq %struct._IO_FILE* %call1.i, null - br i1 %cmp.i, label %if.then.i, label %_Z10readLabelsPKci.exit + br i1 %cmp.i, label %if.then.i, label %_Z11readLabels3PKci.exit -if.then.i: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1227 - %call2.i = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([40 x i8], [40 x i8]* @.str.18, i64 0, i64 0), i8* %549) #7 - call void @abort() #8 +if.then.i: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1236 + %call2.i = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([40 x i8], [40 x i8]* @.str.20, i64 0, i64 0), i8* %549) #2 + call void @abort() #13 unreachable -_Z10readLabelsPKci.exit: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1227 - %call5.i = call i64 @fread(i8* %call.i, i64 1, i64 2000, %struct._IO_FILE* nonnull %call1.i) #7 - %call6.i = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([26 x i8], [26 x i8]* @.str.19, i64 0, i64 0), i64 %call5.i) #7 - call void @__visc__init() #7 - %call128 = call noalias i8* @malloc(i64 512) #7 +_Z11readLabels3PKci.exit: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1236 + %550 = bitcast i8* %call.i to i32* + %call5.i = call i64 @fread(i8* %call.i, i64 1, i64 8000, %struct._IO_FILE* nonnull %call1.i) #2 + %call6.i = call i32 @fclose(%struct._IO_FILE* nonnull %call1.i) #2 + call void @__visc__init() #2 + %call128 = call noalias i8* @malloc(i64 512) #2 %input129 = bitcast i8* %call128 to i8** - %550 = bitcast i8* %call128 to %struct.Tensor** - store %struct.Tensor* %call125, %struct.Tensor** %550, align 1, !tbaa !158 + %551 = bitcast i8* %call128 to %struct.Tensor** + store %struct.Tensor* %call125, %struct.Tensor** %551, align 1, !tbaa !176 %input_bytes = getelementptr inbounds i8, i8* %call128, i64 8 - %551 = bitcast i8* %input_bytes to i64* - store i64 0, i64* %551, align 1, !tbaa !161 + %552 = bitcast i8* %input_bytes to i64* + store i64 0, i64* %552, align 1, !tbaa !179 %conv2d_1_w130 = getelementptr inbounds i8, i8* %call128, i64 16 - %552 = bitcast i8* %conv2d_1_w130 to %struct.Tensor** - store %struct.Tensor* %call7, %struct.Tensor** %552, align 1, !tbaa !162 + %553 = bitcast i8* %conv2d_1_w130 to %struct.Tensor** + store %struct.Tensor* %call7, %struct.Tensor** %553, align 1, !tbaa !180 %conv2d_1_w_bytes = getelementptr inbounds i8, i8* %call128, i64 24 - %553 = bitcast i8* %conv2d_1_w_bytes to i64* - store i64 0, i64* %553, align 1, !tbaa !163 + %554 = bitcast i8* %conv2d_1_w_bytes to i64* + store i64 0, i64* %554, align 1, !tbaa !181 %conv2d_1_b131 = getelementptr inbounds i8, i8* %call128, i64 32 - %554 = bitcast i8* %conv2d_1_b131 to %struct.Tensor** - store %struct.Tensor* %call11, %struct.Tensor** %554, align 1, !tbaa !164 + %555 = bitcast i8* %conv2d_1_b131 to %struct.Tensor** + store %struct.Tensor* %call11, %struct.Tensor** %555, align 1, !tbaa !182 %conv2d_1_b_bytes = getelementptr inbounds i8, i8* %call128, i64 40 - %555 = bitcast i8* %conv2d_1_b_bytes to i64* - store i64 0, i64* %555, align 1, !tbaa !165 + %556 = bitcast i8* %conv2d_1_b_bytes to i64* + store i64 0, i64* %556, align 1, !tbaa !183 %conv2d_2_w132 = getelementptr inbounds i8, i8* %call128, i64 48 - %556 = bitcast i8* %conv2d_2_w132 to %struct.Tensor** - store %struct.Tensor* %call15, %struct.Tensor** %556, align 1, !tbaa !166 + %557 = bitcast i8* %conv2d_2_w132 to %struct.Tensor** + store %struct.Tensor* %call15, %struct.Tensor** %557, align 1, !tbaa !184 %conv2d_2_w_bytes = getelementptr inbounds i8, i8* %call128, i64 56 - %557 = bitcast i8* %conv2d_2_w_bytes to i64* - store i64 0, i64* %557, align 1, !tbaa !167 + %558 = bitcast i8* %conv2d_2_w_bytes to i64* + store i64 0, i64* %558, align 1, !tbaa !185 %conv2d_2_b133 = getelementptr inbounds i8, i8* %call128, i64 64 - %558 = bitcast i8* %conv2d_2_b133 to %struct.Tensor** - store %struct.Tensor* %call19, %struct.Tensor** %558, align 1, !tbaa !168 + %559 = bitcast i8* %conv2d_2_b133 to %struct.Tensor** + store %struct.Tensor* %call19, %struct.Tensor** %559, align 1, !tbaa !186 %conv2d_2_b_bytes = getelementptr inbounds i8, i8* %call128, i64 72 - %559 = bitcast i8* %conv2d_2_b_bytes to i64* - store i64 0, i64* %559, align 1, !tbaa !169 + %560 = bitcast i8* %conv2d_2_b_bytes to i64* + store i64 0, i64* %560, align 1, !tbaa !187 %conv2d_3_w134 = getelementptr inbounds i8, i8* %call128, i64 80 - %560 = bitcast i8* %conv2d_3_w134 to %struct.Tensor** - store %struct.Tensor* %call23, %struct.Tensor** %560, align 1, !tbaa !170 + %561 = bitcast i8* %conv2d_3_w134 to %struct.Tensor** + store %struct.Tensor* %call23, %struct.Tensor** %561, align 1, !tbaa !188 %conv2d_3_w_bytes = getelementptr inbounds i8, i8* %call128, i64 88 - %561 = bitcast i8* %conv2d_3_w_bytes to i64* - store i64 0, i64* %561, align 1, !tbaa !171 + %562 = bitcast i8* %conv2d_3_w_bytes to i64* + store i64 0, i64* %562, align 1, !tbaa !189 %conv2d_3_b135 = getelementptr inbounds i8, i8* %call128, i64 96 - %562 = bitcast i8* %conv2d_3_b135 to %struct.Tensor** - store %struct.Tensor* %call27, %struct.Tensor** %562, align 1, !tbaa !172 + %563 = bitcast i8* %conv2d_3_b135 to %struct.Tensor** + store %struct.Tensor* %call27, %struct.Tensor** %563, align 1, !tbaa !190 %conv2d_3_b_bytes = getelementptr inbounds i8, i8* %call128, i64 104 - %563 = bitcast i8* %conv2d_3_b_bytes to i64* - store i64 0, i64* %563, align 1, !tbaa !173 + %564 = bitcast i8* %conv2d_3_b_bytes to i64* + store i64 0, i64* %564, align 1, !tbaa !191 %conv2d_4_w136 = getelementptr inbounds i8, i8* %call128, i64 112 - %564 = bitcast i8* %conv2d_4_w136 to %struct.Tensor** - store %struct.Tensor* %call31, %struct.Tensor** %564, align 1, !tbaa !174 + %565 = bitcast i8* %conv2d_4_w136 to %struct.Tensor** + store %struct.Tensor* %call31, %struct.Tensor** %565, align 1, !tbaa !192 %conv2d_4_w_bytes = getelementptr inbounds i8, i8* %call128, i64 120 - %565 = bitcast i8* %conv2d_4_w_bytes to i64* - store i64 0, i64* %565, align 1, !tbaa !175 + %566 = bitcast i8* %conv2d_4_w_bytes to i64* + store i64 0, i64* %566, align 1, !tbaa !193 %conv2d_4_b137 = getelementptr inbounds i8, i8* %call128, i64 128 - %566 = bitcast i8* %conv2d_4_b137 to %struct.Tensor** - store %struct.Tensor* %call35, %struct.Tensor** %566, align 1, !tbaa !176 + %567 = bitcast i8* %conv2d_4_b137 to %struct.Tensor** + store %struct.Tensor* %call35, %struct.Tensor** %567, align 1, !tbaa !194 %conv2d_4_b_bytes = getelementptr inbounds i8, i8* %call128, i64 136 - %567 = bitcast i8* %conv2d_4_b_bytes to i64* - store i64 0, i64* %567, align 1, !tbaa !177 + %568 = bitcast i8* %conv2d_4_b_bytes to i64* + store i64 0, i64* %568, align 1, !tbaa !195 %conv2d_5_w138 = getelementptr inbounds i8, i8* %call128, i64 144 - %568 = bitcast i8* %conv2d_5_w138 to %struct.Tensor** - store %struct.Tensor* %call39, %struct.Tensor** %568, align 1, !tbaa !178 + %569 = bitcast i8* %conv2d_5_w138 to %struct.Tensor** + store %struct.Tensor* %call39, %struct.Tensor** %569, align 1, !tbaa !196 %conv2d_5_w_bytes = getelementptr inbounds i8, i8* %call128, i64 152 - %569 = bitcast i8* %conv2d_5_w_bytes to i64* - store i64 0, i64* %569, align 1, !tbaa !179 + %570 = bitcast i8* %conv2d_5_w_bytes to i64* + store i64 0, i64* %570, align 1, !tbaa !197 %conv2d_5_b139 = getelementptr inbounds i8, i8* %call128, i64 160 - %570 = bitcast i8* %conv2d_5_b139 to %struct.Tensor** - store %struct.Tensor* %call43, %struct.Tensor** %570, align 1, !tbaa !180 + %571 = bitcast i8* %conv2d_5_b139 to %struct.Tensor** + store %struct.Tensor* %call43, %struct.Tensor** %571, align 1, !tbaa !198 %conv2d_5_b_bytes = getelementptr inbounds i8, i8* %call128, i64 168 - %571 = bitcast i8* %conv2d_5_b_bytes to i64* - store i64 0, i64* %571, align 1, !tbaa !181 + %572 = bitcast i8* %conv2d_5_b_bytes to i64* + store i64 0, i64* %572, align 1, !tbaa !199 %conv2d_6_w140 = getelementptr inbounds i8, i8* %call128, i64 176 - %572 = bitcast i8* %conv2d_6_w140 to %struct.Tensor** - store %struct.Tensor* %call47, %struct.Tensor** %572, align 1, !tbaa !182 + %573 = bitcast i8* %conv2d_6_w140 to %struct.Tensor** + store %struct.Tensor* %call47, %struct.Tensor** %573, align 1, !tbaa !200 %conv2d_6_w_bytes = getelementptr inbounds i8, i8* %call128, i64 184 - %573 = bitcast i8* %conv2d_6_w_bytes to i64* - store i64 0, i64* %573, align 1, !tbaa !183 + %574 = bitcast i8* %conv2d_6_w_bytes to i64* + store i64 0, i64* %574, align 1, !tbaa !201 %conv2d_6_b141 = getelementptr inbounds i8, i8* %call128, i64 192 - %574 = bitcast i8* %conv2d_6_b141 to %struct.Tensor** - store %struct.Tensor* %call51, %struct.Tensor** %574, align 1, !tbaa !184 + %575 = bitcast i8* %conv2d_6_b141 to %struct.Tensor** + store %struct.Tensor* %call51, %struct.Tensor** %575, align 1, !tbaa !202 %conv2d_6_b_bytes = getelementptr inbounds i8, i8* %call128, i64 200 - %575 = bitcast i8* %conv2d_6_b_bytes to i64* - store i64 0, i64* %575, align 1, !tbaa !185 + %576 = bitcast i8* %conv2d_6_b_bytes to i64* + store i64 0, i64* %576, align 1, !tbaa !203 %conv2d_7_w142 = getelementptr inbounds i8, i8* %call128, i64 208 - %576 = bitcast i8* %conv2d_7_w142 to %struct.Tensor** - store %struct.Tensor* %call55, %struct.Tensor** %576, align 1, !tbaa !186 + %577 = bitcast i8* %conv2d_7_w142 to %struct.Tensor** + store %struct.Tensor* %call55, %struct.Tensor** %577, align 1, !tbaa !204 %conv2d_7_w_bytes = getelementptr inbounds i8, i8* %call128, i64 216 - %577 = bitcast i8* %conv2d_7_w_bytes to i64* - store i64 0, i64* %577, align 1, !tbaa !187 + %578 = bitcast i8* %conv2d_7_w_bytes to i64* + store i64 0, i64* %578, align 1, !tbaa !205 %conv2d_7_b143 = getelementptr inbounds i8, i8* %call128, i64 224 - %578 = bitcast i8* %conv2d_7_b143 to %struct.Tensor** - store %struct.Tensor* %call59, %struct.Tensor** %578, align 1, !tbaa !188 + %579 = bitcast i8* %conv2d_7_b143 to %struct.Tensor** + store %struct.Tensor* %call59, %struct.Tensor** %579, align 1, !tbaa !206 %conv2d_7_b_bytes = getelementptr inbounds i8, i8* %call128, i64 232 - %579 = bitcast i8* %conv2d_7_b_bytes to i64* - store i64 0, i64* %579, align 1, !tbaa !189 + %580 = bitcast i8* %conv2d_7_b_bytes to i64* + store i64 0, i64* %580, align 1, !tbaa !207 %conv2d_8_w144 = getelementptr inbounds i8, i8* %call128, i64 240 - %580 = bitcast i8* %conv2d_8_w144 to %struct.Tensor** - store %struct.Tensor* %call63, %struct.Tensor** %580, align 1, !tbaa !190 + %581 = bitcast i8* %conv2d_8_w144 to %struct.Tensor** + store %struct.Tensor* %call63, %struct.Tensor** %581, align 1, !tbaa !208 %conv2d_8_w_bytes = getelementptr inbounds i8, i8* %call128, i64 248 - %581 = bitcast i8* %conv2d_8_w_bytes to i64* - store i64 0, i64* %581, align 1, !tbaa !191 + %582 = bitcast i8* %conv2d_8_w_bytes to i64* + store i64 0, i64* %582, align 1, !tbaa !209 %conv2d_8_b145 = getelementptr inbounds i8, i8* %call128, i64 256 - %582 = bitcast i8* %conv2d_8_b145 to %struct.Tensor** - store %struct.Tensor* %call67, %struct.Tensor** %582, align 1, !tbaa !192 + %583 = bitcast i8* %conv2d_8_b145 to %struct.Tensor** + store %struct.Tensor* %call67, %struct.Tensor** %583, align 1, !tbaa !210 %conv2d_8_b_bytes = getelementptr inbounds i8, i8* %call128, i64 264 - %583 = bitcast i8* %conv2d_8_b_bytes to i64* - store i64 0, i64* %583, align 1, !tbaa !193 + %584 = bitcast i8* %conv2d_8_b_bytes to i64* + store i64 0, i64* %584, align 1, !tbaa !211 %conv2d_9_w146 = getelementptr inbounds i8, i8* %call128, i64 272 - %584 = bitcast i8* %conv2d_9_w146 to %struct.Tensor** - store %struct.Tensor* %call71, %struct.Tensor** %584, align 1, !tbaa !194 + %585 = bitcast i8* %conv2d_9_w146 to %struct.Tensor** + store %struct.Tensor* %call71, %struct.Tensor** %585, align 1, !tbaa !212 %conv2d_9_w_bytes = getelementptr inbounds i8, i8* %call128, i64 280 - %585 = bitcast i8* %conv2d_9_w_bytes to i64* - store i64 0, i64* %585, align 1, !tbaa !195 + %586 = bitcast i8* %conv2d_9_w_bytes to i64* + store i64 0, i64* %586, align 1, !tbaa !213 %conv2d_9_b147 = getelementptr inbounds i8, i8* %call128, i64 288 - %586 = bitcast i8* %conv2d_9_b147 to %struct.Tensor** - store %struct.Tensor* %call75, %struct.Tensor** %586, align 1, !tbaa !196 + %587 = bitcast i8* %conv2d_9_b147 to %struct.Tensor** + store %struct.Tensor* %call75, %struct.Tensor** %587, align 1, !tbaa !214 %conv2d_9_b_bytes = getelementptr inbounds i8, i8* %call128, i64 296 - %587 = bitcast i8* %conv2d_9_b_bytes to i64* - store i64 0, i64* %587, align 1, !tbaa !197 + %588 = bitcast i8* %conv2d_9_b_bytes to i64* + store i64 0, i64* %588, align 1, !tbaa !215 %conv2d_10_w148 = getelementptr inbounds i8, i8* %call128, i64 304 - %588 = bitcast i8* %conv2d_10_w148 to %struct.Tensor** - store %struct.Tensor* %call79, %struct.Tensor** %588, align 1, !tbaa !198 + %589 = bitcast i8* %conv2d_10_w148 to %struct.Tensor** + store %struct.Tensor* %call79, %struct.Tensor** %589, align 1, !tbaa !216 %conv2d_10_w_bytes = getelementptr inbounds i8, i8* %call128, i64 312 - %589 = bitcast i8* %conv2d_10_w_bytes to i64* - store i64 0, i64* %589, align 1, !tbaa !199 + %590 = bitcast i8* %conv2d_10_w_bytes to i64* + store i64 0, i64* %590, align 1, !tbaa !217 %conv2d_10_b149 = getelementptr inbounds i8, i8* %call128, i64 320 - %590 = bitcast i8* %conv2d_10_b149 to %struct.Tensor** - store %struct.Tensor* %call83, %struct.Tensor** %590, align 1, !tbaa !200 + %591 = bitcast i8* %conv2d_10_b149 to %struct.Tensor** + store %struct.Tensor* %call83, %struct.Tensor** %591, align 1, !tbaa !218 %conv2d_10_b_bytes = getelementptr inbounds i8, i8* %call128, i64 328 - %591 = bitcast i8* %conv2d_10_b_bytes to i64* - store i64 0, i64* %591, align 1, !tbaa !201 + %592 = bitcast i8* %conv2d_10_b_bytes to i64* + store i64 0, i64* %592, align 1, !tbaa !219 %conv2d_11_w150 = getelementptr inbounds i8, i8* %call128, i64 336 - %592 = bitcast i8* %conv2d_11_w150 to %struct.Tensor** - store %struct.Tensor* %call87, %struct.Tensor** %592, align 1, !tbaa !202 + %593 = bitcast i8* %conv2d_11_w150 to %struct.Tensor** + store %struct.Tensor* %call87, %struct.Tensor** %593, align 1, !tbaa !220 %conv2d_11_w_bytes = getelementptr inbounds i8, i8* %call128, i64 344 - %593 = bitcast i8* %conv2d_11_w_bytes to i64* - store i64 0, i64* %593, align 1, !tbaa !203 + %594 = bitcast i8* %conv2d_11_w_bytes to i64* + store i64 0, i64* %594, align 1, !tbaa !221 %conv2d_11_b151 = getelementptr inbounds i8, i8* %call128, i64 352 - %594 = bitcast i8* %conv2d_11_b151 to %struct.Tensor** - store %struct.Tensor* %call91, %struct.Tensor** %594, align 1, !tbaa !204 + %595 = bitcast i8* %conv2d_11_b151 to %struct.Tensor** + store %struct.Tensor* %call91, %struct.Tensor** %595, align 1, !tbaa !222 %conv2d_11_b_bytes = getelementptr inbounds i8, i8* %call128, i64 360 - %595 = bitcast i8* %conv2d_11_b_bytes to i64* - store i64 0, i64* %595, align 1, !tbaa !205 + %596 = bitcast i8* %conv2d_11_b_bytes to i64* + store i64 0, i64* %596, align 1, !tbaa !223 %conv2d_12_w152 = getelementptr inbounds i8, i8* %call128, i64 368 - %596 = bitcast i8* %conv2d_12_w152 to %struct.Tensor** - store %struct.Tensor* %call95, %struct.Tensor** %596, align 1, !tbaa !206 + %597 = bitcast i8* %conv2d_12_w152 to %struct.Tensor** + store %struct.Tensor* %call95, %struct.Tensor** %597, align 1, !tbaa !224 %conv2d_12_w_bytes = getelementptr inbounds i8, i8* %call128, i64 376 - %597 = bitcast i8* %conv2d_12_w_bytes to i64* - store i64 0, i64* %597, align 1, !tbaa !207 + %598 = bitcast i8* %conv2d_12_w_bytes to i64* + store i64 0, i64* %598, align 1, !tbaa !225 %conv2d_12_b153 = getelementptr inbounds i8, i8* %call128, i64 384 - %598 = bitcast i8* %conv2d_12_b153 to %struct.Tensor** - store %struct.Tensor* %call99, %struct.Tensor** %598, align 1, !tbaa !208 + %599 = bitcast i8* %conv2d_12_b153 to %struct.Tensor** + store %struct.Tensor* %call99, %struct.Tensor** %599, align 1, !tbaa !226 %conv2d_12_b_bytes = getelementptr inbounds i8, i8* %call128, i64 392 - %599 = bitcast i8* %conv2d_12_b_bytes to i64* - store i64 0, i64* %599, align 1, !tbaa !209 + %600 = bitcast i8* %conv2d_12_b_bytes to i64* + store i64 0, i64* %600, align 1, !tbaa !227 %conv2d_13_w154 = getelementptr inbounds i8, i8* %call128, i64 400 - %600 = bitcast i8* %conv2d_13_w154 to %struct.Tensor** - store %struct.Tensor* %call103, %struct.Tensor** %600, align 1, !tbaa !210 + %601 = bitcast i8* %conv2d_13_w154 to %struct.Tensor** + store %struct.Tensor* %call103, %struct.Tensor** %601, align 1, !tbaa !228 %conv2d_13_w_bytes = getelementptr inbounds i8, i8* %call128, i64 408 - %601 = bitcast i8* %conv2d_13_w_bytes to i64* - store i64 0, i64* %601, align 1, !tbaa !211 + %602 = bitcast i8* %conv2d_13_w_bytes to i64* + store i64 0, i64* %602, align 1, !tbaa !229 %conv2d_13_b155 = getelementptr inbounds i8, i8* %call128, i64 416 - %602 = bitcast i8* %conv2d_13_b155 to %struct.Tensor** - store %struct.Tensor* %call107, %struct.Tensor** %602, align 1, !tbaa !212 + %603 = bitcast i8* %conv2d_13_b155 to %struct.Tensor** + store %struct.Tensor* %call107, %struct.Tensor** %603, align 1, !tbaa !230 %conv2d_13_b_bytes = getelementptr inbounds i8, i8* %call128, i64 424 - %603 = bitcast i8* %conv2d_13_b_bytes to i64* - store i64 0, i64* %603, align 1, !tbaa !213 + %604 = bitcast i8* %conv2d_13_b_bytes to i64* + store i64 0, i64* %604, align 1, !tbaa !231 %dense_1_w156 = getelementptr inbounds i8, i8* %call128, i64 432 - %604 = bitcast i8* %dense_1_w156 to %struct.Tensor** - store %struct.Tensor* %call111, %struct.Tensor** %604, align 1, !tbaa !214 + %605 = bitcast i8* %dense_1_w156 to %struct.Tensor** + store %struct.Tensor* %call111, %struct.Tensor** %605, align 1, !tbaa !232 %dense_1_w_bytes = getelementptr inbounds i8, i8* %call128, i64 440 - %605 = bitcast i8* %dense_1_w_bytes to i64* - store i64 0, i64* %605, align 1, !tbaa !215 + %606 = bitcast i8* %dense_1_w_bytes to i64* + store i64 0, i64* %606, align 1, !tbaa !233 %dense_1_b157 = getelementptr inbounds i8, i8* %call128, i64 448 - %606 = bitcast i8* %dense_1_b157 to %struct.Tensor** - store %struct.Tensor* %call115, %struct.Tensor** %606, align 1, !tbaa !216 + %607 = bitcast i8* %dense_1_b157 to %struct.Tensor** + store %struct.Tensor* %call115, %struct.Tensor** %607, align 1, !tbaa !234 %dense_1_b_bytes = getelementptr inbounds i8, i8* %call128, i64 456 - %607 = bitcast i8* %dense_1_b_bytes to i64* - store i64 0, i64* %607, align 1, !tbaa !217 + %608 = bitcast i8* %dense_1_b_bytes to i64* + store i64 0, i64* %608, align 1, !tbaa !235 %dense_2_w158 = getelementptr inbounds i8, i8* %call128, i64 464 - %608 = bitcast i8* %dense_2_w158 to %struct.Tensor** - store %struct.Tensor* %call119, %struct.Tensor** %608, align 1, !tbaa !218 + %609 = bitcast i8* %dense_2_w158 to %struct.Tensor** + store %struct.Tensor* %call119, %struct.Tensor** %609, align 1, !tbaa !236 %dense_2_w_bytes = getelementptr inbounds i8, i8* %call128, i64 472 - %609 = bitcast i8* %dense_2_w_bytes to i64* - store i64 0, i64* %609, align 1, !tbaa !219 + %610 = bitcast i8* %dense_2_w_bytes to i64* + store i64 0, i64* %610, align 1, !tbaa !237 %dense_2_b159 = getelementptr inbounds i8, i8* %call128, i64 480 - %610 = bitcast i8* %dense_2_b159 to %struct.Tensor** - store %struct.Tensor* %call123, %struct.Tensor** %610, align 1, !tbaa !220 + %611 = bitcast i8* %dense_2_b159 to %struct.Tensor** + store %struct.Tensor* %call123, %struct.Tensor** %611, align 1, !tbaa !238 %dense_2_b_bytes = getelementptr inbounds i8, i8* %call128, i64 488 - %611 = bitcast i8* %dense_2_b_bytes to i64* - store i64 0, i64* %611, align 1, !tbaa !221 - %call160 = call i8* (i32, ...) @__visc__launch(i32 0, void (i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64)* nonnull @_Z4rootPvmS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_m, i8* %call128) #7 - call void @__visc__wait(i8* %call160) #7 - %612 = load i8*, i8** %input129, align 1, !tbaa !158 - call void @hpvm_request_tensor(i8* %612, i32 0) #7 - call void @__visc__cleanup() #7 - call void @_Z16computeAccuracy2PhiPv(i8* %call.i, i32 undef, i8* %612) - %613 = load i8*, i8** %_M_p.i.i1222, align 8, !tbaa !56 - %arraydecay.i.i.i.i1031 = bitcast %union.anon* %539 to i8* - %cmp.i.i.i1032 = icmp eq i8* %613, %arraydecay.i.i.i.i1031 - br i1 %cmp.i.i.i1032, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1034, label %if.then.i.i1033 - -if.then.i.i1033: ; preds = %_Z10readLabelsPKci.exit - call void @_ZdlPv(i8* %613) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1034 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1034: ; preds = %_Z10readLabelsPKci.exit, %if.then.i.i1033 - call void @llvm.lifetime.end(i64 32, i8* nonnull %531) #7 - %614 = load i8*, i8** %_M_p.i.i1333, align 8, !tbaa !56 - %arraydecay.i.i.i.i989 = bitcast %union.anon* %522 to i8* - %cmp.i.i.i990 = icmp eq i8* %614, %arraydecay.i.i.i.i989 - br i1 %cmp.i.i.i990, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit992, label %if.then.i.i991 - -if.then.i.i991: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1034 - call void @_ZdlPv(i8* %614) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit992 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit992: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1034, %if.then.i.i991 - call void @llvm.lifetime.end(i64 32, i8* nonnull %514) #7 - %615 = load i8*, i8** %_M_p.i.i1481, align 8, !tbaa !56 - %arraydecay.i.i.i.i984 = bitcast %union.anon* %505 to i8* - %cmp.i.i.i985 = icmp eq i8* %615, %arraydecay.i.i.i.i984 - br i1 %cmp.i.i.i985, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit987, label %if.then.i.i986 - -if.then.i.i986: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit992 - call void @_ZdlPv(i8* %615) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit987 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit987: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit992, %if.then.i.i986 - call void @llvm.lifetime.end(i64 32, i8* nonnull %497) #7 - %616 = load i8*, i8** %_M_p.i.i1591, align 8, !tbaa !56 - %arraydecay.i.i.i.i942 = bitcast %union.anon* %488 to i8* - %cmp.i.i.i943 = icmp eq i8* %616, %arraydecay.i.i.i.i942 - br i1 %cmp.i.i.i943, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit945, label %if.then.i.i944 - -if.then.i.i944: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit987 - call void @_ZdlPv(i8* %616) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit945 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit945: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit987, %if.then.i.i944 - call void @llvm.lifetime.end(i64 32, i8* nonnull %480) #7 - %617 = load i8*, i8** %_M_p.i.i1538, align 8, !tbaa !56 - %arraydecay.i.i.i.i937 = bitcast %union.anon* %471 to i8* - %cmp.i.i.i938 = icmp eq i8* %617, %arraydecay.i.i.i.i937 - br i1 %cmp.i.i.i938, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit940, label %if.then.i.i939 - -if.then.i.i939: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit945 - call void @_ZdlPv(i8* %617) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit940 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit940: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit945, %if.then.i.i939 - call void @llvm.lifetime.end(i64 32, i8* nonnull %463) #7 - %618 = load i8*, i8** %_M_p.i.i1480, align 8, !tbaa !56 - %arraydecay.i.i.i.i895 = bitcast %union.anon* %454 to i8* - %cmp.i.i.i896 = icmp eq i8* %618, %arraydecay.i.i.i.i895 - br i1 %cmp.i.i.i896, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit898, label %if.then.i.i897 - -if.then.i.i897: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit940 - call void @_ZdlPv(i8* %618) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit898 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit898: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit940, %if.then.i.i897 - call void @llvm.lifetime.end(i64 32, i8* nonnull %446) #7 - %619 = load i8*, i8** %_M_p.i.i1443, align 8, !tbaa !56 - %arraydecay.i.i.i.i890 = bitcast %union.anon* %437 to i8* - %cmp.i.i.i891 = icmp eq i8* %619, %arraydecay.i.i.i.i890 - br i1 %cmp.i.i.i891, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit893, label %if.then.i.i892 - -if.then.i.i892: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit898 - call void @_ZdlPv(i8* %619) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit893 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit893: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit898, %if.then.i.i892 - call void @llvm.lifetime.end(i64 32, i8* nonnull %429) #7 - %620 = load i8*, i8** %_M_p.i.i1375, align 8, !tbaa !56 - %arraydecay.i.i.i.i848 = bitcast %union.anon* %420 to i8* - %cmp.i.i.i849 = icmp eq i8* %620, %arraydecay.i.i.i.i848 - br i1 %cmp.i.i.i849, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit851, label %if.then.i.i850 - -if.then.i.i850: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit893 - call void @_ZdlPv(i8* %620) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit851 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit851: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit893, %if.then.i.i850 - call void @llvm.lifetime.end(i64 32, i8* nonnull %412) #7 - %621 = load i8*, i8** %_M_p.i.i1332, align 8, !tbaa !56 - %arraydecay.i.i.i.i843 = bitcast %union.anon* %403 to i8* - %cmp.i.i.i844 = icmp eq i8* %621, %arraydecay.i.i.i.i843 - br i1 %cmp.i.i.i844, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit846, label %if.then.i.i845 - -if.then.i.i845: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit851 - call void @_ZdlPv(i8* %621) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit846 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit846: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit851, %if.then.i.i845 - call void @llvm.lifetime.end(i64 32, i8* nonnull %395) #7 - %622 = load i8*, i8** %_M_p.i.i1279, align 8, !tbaa !56 - %arraydecay.i.i.i.i801 = bitcast %union.anon* %386 to i8* - %cmp.i.i.i802 = icmp eq i8* %622, %arraydecay.i.i.i.i801 - br i1 %cmp.i.i.i802, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit804, label %if.then.i.i803 - -if.then.i.i803: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit846 - call void @_ZdlPv(i8* %622) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit804 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit804: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit846, %if.then.i.i803 - call void @llvm.lifetime.end(i64 32, i8* nonnull %378) #7 - %623 = load i8*, i8** %_M_p.i.i1221, align 8, !tbaa !56 - %arraydecay.i.i.i.i796 = bitcast %union.anon* %369 to i8* - %cmp.i.i.i797 = icmp eq i8* %623, %arraydecay.i.i.i.i796 - br i1 %cmp.i.i.i797, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit799, label %if.then.i.i798 - -if.then.i.i798: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit804 - call void @_ZdlPv(i8* %623) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit799 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit799: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit804, %if.then.i.i798 - call void @llvm.lifetime.end(i64 32, i8* nonnull %361) #7 - %624 = load i8*, i8** %_M_p.i.i1182, align 8, !tbaa !56 - %arraydecay.i.i.i.i754 = bitcast %union.anon* %352 to i8* - %cmp.i.i.i755 = icmp eq i8* %624, %arraydecay.i.i.i.i754 - br i1 %cmp.i.i.i755, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit757, label %if.then.i.i756 - -if.then.i.i756: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit799 - call void @_ZdlPv(i8* %624) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit757 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit757: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit799, %if.then.i.i756 - call void @llvm.lifetime.end(i64 32, i8* nonnull %344) #7 - %625 = load i8*, i8** %_M_p.i.i1145, align 8, !tbaa !56 - %arraydecay.i.i.i.i749 = bitcast %union.anon* %335 to i8* - %cmp.i.i.i750 = icmp eq i8* %625, %arraydecay.i.i.i.i749 - br i1 %cmp.i.i.i750, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit752, label %if.then.i.i751 - -if.then.i.i751: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit757 - call void @_ZdlPv(i8* %625) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit752 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit752: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit757, %if.then.i.i751 - call void @llvm.lifetime.end(i64 32, i8* nonnull %327) #7 - %626 = load i8*, i8** %_M_p.i.i1108, align 8, !tbaa !56 - %arraydecay.i.i.i.i707 = bitcast %union.anon* %318 to i8* - %cmp.i.i.i708 = icmp eq i8* %626, %arraydecay.i.i.i.i707 - br i1 %cmp.i.i.i708, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit710, label %if.then.i.i709 - -if.then.i.i709: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit752 - call void @_ZdlPv(i8* %626) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit710 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit710: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit752, %if.then.i.i709 - call void @llvm.lifetime.end(i64 32, i8* nonnull %310) #7 - %627 = load i8*, i8** %_M_p.i.i1071, align 8, !tbaa !56 - %arraydecay.i.i.i.i702 = bitcast %union.anon* %301 to i8* - %cmp.i.i.i703 = icmp eq i8* %627, %arraydecay.i.i.i.i702 - br i1 %cmp.i.i.i703, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit705, label %if.then.i.i704 - -if.then.i.i704: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit710 - call void @_ZdlPv(i8* %627) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit705 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit705: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit710, %if.then.i.i704 - call void @llvm.lifetime.end(i64 32, i8* nonnull %293) #7 - %628 = load i8*, i8** %_M_p.i.i1029, align 8, !tbaa !56 - %arraydecay.i.i.i.i660 = bitcast %union.anon* %284 to i8* - %cmp.i.i.i661 = icmp eq i8* %628, %arraydecay.i.i.i.i660 - br i1 %cmp.i.i.i661, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit663, label %if.then.i.i662 - -if.then.i.i662: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit705 - call void @_ZdlPv(i8* %628) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit663 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit663: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit705, %if.then.i.i662 - call void @llvm.lifetime.end(i64 32, i8* nonnull %276) #7 - %629 = load i8*, i8** %_M_p.i.i982, align 8, !tbaa !56 - %arraydecay.i.i.i.i655 = bitcast %union.anon* %267 to i8* - %cmp.i.i.i656 = icmp eq i8* %629, %arraydecay.i.i.i.i655 - br i1 %cmp.i.i.i656, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit658, label %if.then.i.i657 - -if.then.i.i657: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit663 - call void @_ZdlPv(i8* %629) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit658 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit658: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit663, %if.then.i.i657 - call void @llvm.lifetime.end(i64 32, i8* nonnull %259) #7 - %630 = load i8*, i8** %_M_p.i.i935, align 8, !tbaa !56 - %arraydecay.i.i.i.i613 = bitcast %union.anon* %250 to i8* - %cmp.i.i.i614 = icmp eq i8* %630, %arraydecay.i.i.i.i613 - br i1 %cmp.i.i.i614, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit616, label %if.then.i.i615 - -if.then.i.i615: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit658 - call void @_ZdlPv(i8* %630) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit616 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit616: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit658, %if.then.i.i615 - call void @llvm.lifetime.end(i64 32, i8* nonnull %242) #7 - %631 = load i8*, i8** %_M_p.i.i888, align 8, !tbaa !56 - %arraydecay.i.i.i.i608 = bitcast %union.anon* %233 to i8* - %cmp.i.i.i609 = icmp eq i8* %631, %arraydecay.i.i.i.i608 - br i1 %cmp.i.i.i609, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit611, label %if.then.i.i610 - -if.then.i.i610: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit616 - call void @_ZdlPv(i8* %631) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit611 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit611: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit616, %if.then.i.i610 - call void @llvm.lifetime.end(i64 32, i8* nonnull %225) #7 - %632 = load i8*, i8** %_M_p.i.i841, align 8, !tbaa !56 - %arraydecay.i.i.i.i566 = bitcast %union.anon* %216 to i8* - %cmp.i.i.i567 = icmp eq i8* %632, %arraydecay.i.i.i.i566 - br i1 %cmp.i.i.i567, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit569, label %if.then.i.i568 - -if.then.i.i568: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit611 - call void @_ZdlPv(i8* %632) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit569 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit569: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit611, %if.then.i.i568 - call void @llvm.lifetime.end(i64 32, i8* nonnull %208) #7 - %633 = load i8*, i8** %_M_p.i.i794, align 8, !tbaa !56 - %arraydecay.i.i.i.i561 = bitcast %union.anon* %199 to i8* - %cmp.i.i.i562 = icmp eq i8* %633, %arraydecay.i.i.i.i561 - br i1 %cmp.i.i.i562, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit564, label %if.then.i.i563 - -if.then.i.i563: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit569 - call void @_ZdlPv(i8* %633) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit564 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit564: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit569, %if.then.i.i563 - call void @llvm.lifetime.end(i64 32, i8* nonnull %191) #7 - %634 = load i8*, i8** %_M_p.i.i747, align 8, !tbaa !56 - %arraydecay.i.i.i.i519 = bitcast %union.anon* %182 to i8* - %cmp.i.i.i520 = icmp eq i8* %634, %arraydecay.i.i.i.i519 - br i1 %cmp.i.i.i520, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit522, label %if.then.i.i521 - -if.then.i.i521: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit564 - call void @_ZdlPv(i8* %634) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit522 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit522: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit564, %if.then.i.i521 - call void @llvm.lifetime.end(i64 32, i8* nonnull %174) #7 - %635 = load i8*, i8** %_M_p.i.i700, align 8, !tbaa !56 - %arraydecay.i.i.i.i514 = bitcast %union.anon* %165 to i8* - %cmp.i.i.i515 = icmp eq i8* %635, %arraydecay.i.i.i.i514 - br i1 %cmp.i.i.i515, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit517, label %if.then.i.i516 - -if.then.i.i516: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit522 - call void @_ZdlPv(i8* %635) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit517 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit517: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit522, %if.then.i.i516 - call void @llvm.lifetime.end(i64 32, i8* nonnull %157) #7 - %636 = load i8*, i8** %_M_p.i.i653, align 8, !tbaa !56 - %arraydecay.i.i.i.i472 = bitcast %union.anon* %148 to i8* - %cmp.i.i.i473 = icmp eq i8* %636, %arraydecay.i.i.i.i472 - br i1 %cmp.i.i.i473, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit475, label %if.then.i.i474 - -if.then.i.i474: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit517 - call void @_ZdlPv(i8* %636) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit475 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit475: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit517, %if.then.i.i474 - call void @llvm.lifetime.end(i64 32, i8* nonnull %140) #7 - %637 = load i8*, i8** %_M_p.i.i606, align 8, !tbaa !56 - %arraydecay.i.i.i.i467 = bitcast %union.anon* %131 to i8* - %cmp.i.i.i468 = icmp eq i8* %637, %arraydecay.i.i.i.i467 - br i1 %cmp.i.i.i468, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit470, label %if.then.i.i469 - -if.then.i.i469: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit475 - call void @_ZdlPv(i8* %637) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit470 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit470: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit475, %if.then.i.i469 - call void @llvm.lifetime.end(i64 32, i8* nonnull %123) #7 - %638 = load i8*, i8** %_M_p.i.i559, align 8, !tbaa !56 - %arraydecay.i.i.i.i425 = bitcast %union.anon* %114 to i8* - %cmp.i.i.i426 = icmp eq i8* %638, %arraydecay.i.i.i.i425 - br i1 %cmp.i.i.i426, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit428, label %if.then.i.i427 - -if.then.i.i427: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit470 - call void @_ZdlPv(i8* %638) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit428 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit428: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit470, %if.then.i.i427 - call void @llvm.lifetime.end(i64 32, i8* nonnull %106) #7 - %639 = load i8*, i8** %_M_p.i.i512, align 8, !tbaa !56 - %arraydecay.i.i.i.i420 = bitcast %union.anon* %97 to i8* - %cmp.i.i.i421 = icmp eq i8* %639, %arraydecay.i.i.i.i420 - br i1 %cmp.i.i.i421, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit423, label %if.then.i.i422 - -if.then.i.i422: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit428 - call void @_ZdlPv(i8* %639) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit423 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit423: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit428, %if.then.i.i422 - call void @llvm.lifetime.end(i64 32, i8* nonnull %89) #7 - %640 = load i8*, i8** %_M_p.i.i465, align 8, !tbaa !56 - %arraydecay.i.i.i.i378 = bitcast %union.anon* %80 to i8* - %cmp.i.i.i379 = icmp eq i8* %640, %arraydecay.i.i.i.i378 - br i1 %cmp.i.i.i379, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit381, label %if.then.i.i380 - -if.then.i.i380: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit423 - call void @_ZdlPv(i8* %640) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit381 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit381: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit423, %if.then.i.i380 - call void @llvm.lifetime.end(i64 32, i8* nonnull %72) #7 - %641 = load i8*, i8** %_M_p.i.i418, align 8, !tbaa !56 - %arraydecay.i.i.i.i373 = bitcast %union.anon* %63 to i8* - %cmp.i.i.i374 = icmp eq i8* %641, %arraydecay.i.i.i.i373 - br i1 %cmp.i.i.i374, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit376, label %if.then.i.i375 - -if.then.i.i375: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit381 - call void @_ZdlPv(i8* %641) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit376 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit376: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit381, %if.then.i.i375 - call void @llvm.lifetime.end(i64 32, i8* nonnull %55) #7 - %642 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !56 - %arraydecay.i.i.i.i332 = bitcast %union.anon* %46 to i8* - %cmp.i.i.i333 = icmp eq i8* %642, %arraydecay.i.i.i.i332 - br i1 %cmp.i.i.i333, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit335, label %if.then.i.i334 - -if.then.i.i334: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit376 - call void @_ZdlPv(i8* %642) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit335 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit335: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit376, %if.then.i.i334 - call void @llvm.lifetime.end(i64 32, i8* nonnull %38) #7 - %643 = load i8*, i8** %_M_p.i.i1183, align 8, !tbaa !56 - %arraydecay.i.i.i.i291 = bitcast %union.anon* %30 to i8* - %cmp.i.i.i292 = icmp eq i8* %643, %arraydecay.i.i.i.i291 - br i1 %cmp.i.i.i292, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit294, label %if.then.i.i293 - -if.then.i.i293: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit335 - call void @_ZdlPv(i8* %643) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit294 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit294: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit335, %if.then.i.i293 - call void @llvm.lifetime.end(i64 32, i8* nonnull %22) #7 - %644 = load i8*, i8** %_M_p.i.i1184, align 8, !tbaa !56 - %arraydecay.i.i.i.i262 = bitcast %union.anon* %14 to i8* - %cmp.i.i.i263 = icmp eq i8* %644, %arraydecay.i.i.i.i262 - br i1 %cmp.i.i.i263, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit265, label %if.then.i.i264 - -if.then.i.i264: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit294 - call void @_ZdlPv(i8* %644) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit265 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit265: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit294, %if.then.i.i264 - call void @llvm.lifetime.end(i64 32, i8* nonnull %6) #7 - %645 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !56 - %cmp.i.i.i = icmp eq i8* %645, %3 + %612 = bitcast i8* %dense_2_b_bytes to i64* + store i64 0, i64* %612, align 1, !tbaa !239 + call void @startMemTracking() #2 + call void @startProfiling() #2 + %613 = load i8*, i8** %_M_p.i.i1230, align 8, !tbaa !62 + %call161 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %613, i32 0, i64 2000, i64 3, i64 32, i64 32) + store %struct.Tensor* %call161, %struct.Tensor** %551, align 1, !tbaa !176 + store i64 0, i64* %552, align 1, !tbaa !179 + %call164 = call i8* (i32, ...) @__visc__launch(i32 0, void (i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64)* nonnull @_Z4rootPvmS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_m, i8* %call128) #2 + call void @__visc__wait(i8* %call164) #2 + %614 = load i8*, i8** %input129, align 1, !tbaa !176 + call void @hpvm_request_tensor(i8* %614, i32 0) #2 + %call166 = call fast float @_Z16computeAccuracy3PjPv(i32* %550, i8* %614) + call void @freeBatchMemory() #2 + call void @stopProfiling() #2 + call void @__visc__cleanup() #2 + %615 = load i8*, i8** %_M_p.i.i1231, align 8, !tbaa !62 + %arraydecay.i.i.i.i1039 = bitcast %union.anon* %539 to i8* + %cmp.i.i.i1040 = icmp eq i8* %615, %arraydecay.i.i.i.i1039 + br i1 %cmp.i.i.i1040, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1042, label %if.then.i.i1041 + +if.then.i.i1041: ; preds = %_Z11readLabels3PKci.exit + call void @_ZdlPv(i8* %615) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1042 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1042: ; preds = %_Z11readLabels3PKci.exit, %if.then.i.i1041 + call void @llvm.lifetime.end(i64 32, i8* nonnull %531) #2 + %616 = load i8*, i8** %_M_p.i.i1379, align 8, !tbaa !62 + %arraydecay.i.i.i.i997 = bitcast %union.anon* %522 to i8* + %cmp.i.i.i998 = icmp eq i8* %616, %arraydecay.i.i.i.i997 + br i1 %cmp.i.i.i998, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1000, label %if.then.i.i999 + +if.then.i.i999: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1042 + call void @_ZdlPv(i8* %616) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1000 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1000: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1042, %if.then.i.i999 + call void @llvm.lifetime.end(i64 32, i8* nonnull %514) #2 + %617 = load i8*, i8** %_M_p.i.i1490, align 8, !tbaa !62 + %arraydecay.i.i.i.i992 = bitcast %union.anon* %505 to i8* + %cmp.i.i.i993 = icmp eq i8* %617, %arraydecay.i.i.i.i992 + br i1 %cmp.i.i.i993, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit995, label %if.then.i.i994 + +if.then.i.i994: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1000 + call void @_ZdlPv(i8* %617) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit995 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit995: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1000, %if.then.i.i994 + call void @llvm.lifetime.end(i64 32, i8* nonnull %497) #2 + %618 = load i8*, i8** %_M_p.i.i1600, align 8, !tbaa !62 + %arraydecay.i.i.i.i950 = bitcast %union.anon* %488 to i8* + %cmp.i.i.i951 = icmp eq i8* %618, %arraydecay.i.i.i.i950 + br i1 %cmp.i.i.i951, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit953, label %if.then.i.i952 + +if.then.i.i952: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit995 + call void @_ZdlPv(i8* %618) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit953 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit953: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit995, %if.then.i.i952 + call void @llvm.lifetime.end(i64 32, i8* nonnull %480) #2 + %619 = load i8*, i8** %_M_p.i.i1532, align 8, !tbaa !62 + %arraydecay.i.i.i.i945 = bitcast %union.anon* %471 to i8* + %cmp.i.i.i946 = icmp eq i8* %619, %arraydecay.i.i.i.i945 + br i1 %cmp.i.i.i946, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit948, label %if.then.i.i947 + +if.then.i.i947: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit953 + call void @_ZdlPv(i8* %619) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit948 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit948: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit953, %if.then.i.i947 + call void @llvm.lifetime.end(i64 32, i8* nonnull %463) #2 + %620 = load i8*, i8** %_M_p.i.i1489, align 8, !tbaa !62 + %arraydecay.i.i.i.i903 = bitcast %union.anon* %454 to i8* + %cmp.i.i.i904 = icmp eq i8* %620, %arraydecay.i.i.i.i903 + br i1 %cmp.i.i.i904, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit906, label %if.then.i.i905 + +if.then.i.i905: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit948 + call void @_ZdlPv(i8* %620) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit906 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit906: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit948, %if.then.i.i905 + call void @llvm.lifetime.end(i64 32, i8* nonnull %446) #2 + %621 = load i8*, i8** %_M_p.i.i1436, align 8, !tbaa !62 + %arraydecay.i.i.i.i898 = bitcast %union.anon* %437 to i8* + %cmp.i.i.i899 = icmp eq i8* %621, %arraydecay.i.i.i.i898 + br i1 %cmp.i.i.i899, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit901, label %if.then.i.i900 + +if.then.i.i900: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit906 + call void @_ZdlPv(i8* %621) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit901 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit901: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit906, %if.then.i.i900 + call void @llvm.lifetime.end(i64 32, i8* nonnull %429) #2 + %622 = load i8*, i8** %_M_p.i.i1378, align 8, !tbaa !62 + %arraydecay.i.i.i.i856 = bitcast %union.anon* %420 to i8* + %cmp.i.i.i857 = icmp eq i8* %622, %arraydecay.i.i.i.i856 + br i1 %cmp.i.i.i857, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit859, label %if.then.i.i858 + +if.then.i.i858: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit901 + call void @_ZdlPv(i8* %622) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit859 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit859: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit901, %if.then.i.i858 + call void @llvm.lifetime.end(i64 32, i8* nonnull %412) #2 + %623 = load i8*, i8** %_M_p.i.i1341, align 8, !tbaa !62 + %arraydecay.i.i.i.i851 = bitcast %union.anon* %403 to i8* + %cmp.i.i.i852 = icmp eq i8* %623, %arraydecay.i.i.i.i851 + br i1 %cmp.i.i.i852, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit854, label %if.then.i.i853 + +if.then.i.i853: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit859 + call void @_ZdlPv(i8* %623) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit854 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit854: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit859, %if.then.i.i853 + call void @llvm.lifetime.end(i64 32, i8* nonnull %395) #2 + %624 = load i8*, i8** %_M_p.i.i1273, align 8, !tbaa !62 + %arraydecay.i.i.i.i809 = bitcast %union.anon* %386 to i8* + %cmp.i.i.i810 = icmp eq i8* %624, %arraydecay.i.i.i.i809 + br i1 %cmp.i.i.i810, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit812, label %if.then.i.i811 + +if.then.i.i811: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit854 + call void @_ZdlPv(i8* %624) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit812 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit812: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit854, %if.then.i.i811 + call void @llvm.lifetime.end(i64 32, i8* nonnull %378) #2 + %625 = load i8*, i8** %_M_p.i.i1229, align 8, !tbaa !62 + %arraydecay.i.i.i.i804 = bitcast %union.anon* %369 to i8* + %cmp.i.i.i805 = icmp eq i8* %625, %arraydecay.i.i.i.i804 + br i1 %cmp.i.i.i805, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit807, label %if.then.i.i806 + +if.then.i.i806: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit812 + call void @_ZdlPv(i8* %625) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit807 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit807: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit812, %if.then.i.i806 + call void @llvm.lifetime.end(i64 32, i8* nonnull %361) #2 + %626 = load i8*, i8** %_M_p.i.i1191, align 8, !tbaa !62 + %arraydecay.i.i.i.i762 = bitcast %union.anon* %352 to i8* + %cmp.i.i.i763 = icmp eq i8* %626, %arraydecay.i.i.i.i762 + br i1 %cmp.i.i.i763, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit765, label %if.then.i.i764 + +if.then.i.i764: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit807 + call void @_ZdlPv(i8* %626) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit765 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit765: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit807, %if.then.i.i764 + call void @llvm.lifetime.end(i64 32, i8* nonnull %344) #2 + %627 = load i8*, i8** %_M_p.i.i1154, align 8, !tbaa !62 + %arraydecay.i.i.i.i757 = bitcast %union.anon* %335 to i8* + %cmp.i.i.i758 = icmp eq i8* %627, %arraydecay.i.i.i.i757 + br i1 %cmp.i.i.i758, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit760, label %if.then.i.i759 + +if.then.i.i759: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit765 + call void @_ZdlPv(i8* %627) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit760 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit760: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit765, %if.then.i.i759 + call void @llvm.lifetime.end(i64 32, i8* nonnull %327) #2 + %628 = load i8*, i8** %_M_p.i.i1117, align 8, !tbaa !62 + %arraydecay.i.i.i.i715 = bitcast %union.anon* %318 to i8* + %cmp.i.i.i716 = icmp eq i8* %628, %arraydecay.i.i.i.i715 + br i1 %cmp.i.i.i716, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit718, label %if.then.i.i717 + +if.then.i.i717: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit760 + call void @_ZdlPv(i8* %628) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit718 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit718: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit760, %if.then.i.i717 + call void @llvm.lifetime.end(i64 32, i8* nonnull %310) #2 + %629 = load i8*, i8** %_M_p.i.i1079, align 8, !tbaa !62 + %arraydecay.i.i.i.i710 = bitcast %union.anon* %301 to i8* + %cmp.i.i.i711 = icmp eq i8* %629, %arraydecay.i.i.i.i710 + br i1 %cmp.i.i.i711, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit713, label %if.then.i.i712 + +if.then.i.i712: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit718 + call void @_ZdlPv(i8* %629) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit713 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit713: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit718, %if.then.i.i712 + call void @llvm.lifetime.end(i64 32, i8* nonnull %293) #2 + %630 = load i8*, i8** %_M_p.i.i1037, align 8, !tbaa !62 + %arraydecay.i.i.i.i668 = bitcast %union.anon* %284 to i8* + %cmp.i.i.i669 = icmp eq i8* %630, %arraydecay.i.i.i.i668 + br i1 %cmp.i.i.i669, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit671, label %if.then.i.i670 + +if.then.i.i670: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit713 + call void @_ZdlPv(i8* %630) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit671 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit671: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit713, %if.then.i.i670 + call void @llvm.lifetime.end(i64 32, i8* nonnull %276) #2 + %631 = load i8*, i8** %_M_p.i.i990, align 8, !tbaa !62 + %arraydecay.i.i.i.i663 = bitcast %union.anon* %267 to i8* + %cmp.i.i.i664 = icmp eq i8* %631, %arraydecay.i.i.i.i663 + br i1 %cmp.i.i.i664, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit666, label %if.then.i.i665 + +if.then.i.i665: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit671 + call void @_ZdlPv(i8* %631) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit666 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit666: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit671, %if.then.i.i665 + call void @llvm.lifetime.end(i64 32, i8* nonnull %259) #2 + %632 = load i8*, i8** %_M_p.i.i943, align 8, !tbaa !62 + %arraydecay.i.i.i.i621 = bitcast %union.anon* %250 to i8* + %cmp.i.i.i622 = icmp eq i8* %632, %arraydecay.i.i.i.i621 + br i1 %cmp.i.i.i622, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit624, label %if.then.i.i623 + +if.then.i.i623: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit666 + call void @_ZdlPv(i8* %632) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit624 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit624: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit666, %if.then.i.i623 + call void @llvm.lifetime.end(i64 32, i8* nonnull %242) #2 + %633 = load i8*, i8** %_M_p.i.i896, align 8, !tbaa !62 + %arraydecay.i.i.i.i616 = bitcast %union.anon* %233 to i8* + %cmp.i.i.i617 = icmp eq i8* %633, %arraydecay.i.i.i.i616 + br i1 %cmp.i.i.i617, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit619, label %if.then.i.i618 + +if.then.i.i618: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit624 + call void @_ZdlPv(i8* %633) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit619 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit619: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit624, %if.then.i.i618 + call void @llvm.lifetime.end(i64 32, i8* nonnull %225) #2 + %634 = load i8*, i8** %_M_p.i.i849, align 8, !tbaa !62 + %arraydecay.i.i.i.i574 = bitcast %union.anon* %216 to i8* + %cmp.i.i.i575 = icmp eq i8* %634, %arraydecay.i.i.i.i574 + br i1 %cmp.i.i.i575, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit577, label %if.then.i.i576 + +if.then.i.i576: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit619 + call void @_ZdlPv(i8* %634) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit577 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit577: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit619, %if.then.i.i576 + call void @llvm.lifetime.end(i64 32, i8* nonnull %208) #2 + %635 = load i8*, i8** %_M_p.i.i802, align 8, !tbaa !62 + %arraydecay.i.i.i.i569 = bitcast %union.anon* %199 to i8* + %cmp.i.i.i570 = icmp eq i8* %635, %arraydecay.i.i.i.i569 + br i1 %cmp.i.i.i570, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit572, label %if.then.i.i571 + +if.then.i.i571: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit577 + call void @_ZdlPv(i8* %635) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit572 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit572: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit577, %if.then.i.i571 + call void @llvm.lifetime.end(i64 32, i8* nonnull %191) #2 + %636 = load i8*, i8** %_M_p.i.i755, align 8, !tbaa !62 + %arraydecay.i.i.i.i527 = bitcast %union.anon* %182 to i8* + %cmp.i.i.i528 = icmp eq i8* %636, %arraydecay.i.i.i.i527 + br i1 %cmp.i.i.i528, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit530, label %if.then.i.i529 + +if.then.i.i529: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit572 + call void @_ZdlPv(i8* %636) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit530 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit530: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit572, %if.then.i.i529 + call void @llvm.lifetime.end(i64 32, i8* nonnull %174) #2 + %637 = load i8*, i8** %_M_p.i.i708, align 8, !tbaa !62 + %arraydecay.i.i.i.i522 = bitcast %union.anon* %165 to i8* + %cmp.i.i.i523 = icmp eq i8* %637, %arraydecay.i.i.i.i522 + br i1 %cmp.i.i.i523, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit525, label %if.then.i.i524 + +if.then.i.i524: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit530 + call void @_ZdlPv(i8* %637) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit525 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit525: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit530, %if.then.i.i524 + call void @llvm.lifetime.end(i64 32, i8* nonnull %157) #2 + %638 = load i8*, i8** %_M_p.i.i661, align 8, !tbaa !62 + %arraydecay.i.i.i.i480 = bitcast %union.anon* %148 to i8* + %cmp.i.i.i481 = icmp eq i8* %638, %arraydecay.i.i.i.i480 + br i1 %cmp.i.i.i481, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit483, label %if.then.i.i482 + +if.then.i.i482: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit525 + call void @_ZdlPv(i8* %638) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit483 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit483: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit525, %if.then.i.i482 + call void @llvm.lifetime.end(i64 32, i8* nonnull %140) #2 + %639 = load i8*, i8** %_M_p.i.i614, align 8, !tbaa !62 + %arraydecay.i.i.i.i475 = bitcast %union.anon* %131 to i8* + %cmp.i.i.i476 = icmp eq i8* %639, %arraydecay.i.i.i.i475 + br i1 %cmp.i.i.i476, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit478, label %if.then.i.i477 + +if.then.i.i477: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit483 + call void @_ZdlPv(i8* %639) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit478 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit478: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit483, %if.then.i.i477 + call void @llvm.lifetime.end(i64 32, i8* nonnull %123) #2 + %640 = load i8*, i8** %_M_p.i.i567, align 8, !tbaa !62 + %arraydecay.i.i.i.i433 = bitcast %union.anon* %114 to i8* + %cmp.i.i.i434 = icmp eq i8* %640, %arraydecay.i.i.i.i433 + br i1 %cmp.i.i.i434, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit436, label %if.then.i.i435 + +if.then.i.i435: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit478 + call void @_ZdlPv(i8* %640) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit436 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit436: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit478, %if.then.i.i435 + call void @llvm.lifetime.end(i64 32, i8* nonnull %106) #2 + %641 = load i8*, i8** %_M_p.i.i520, align 8, !tbaa !62 + %arraydecay.i.i.i.i428 = bitcast %union.anon* %97 to i8* + %cmp.i.i.i429 = icmp eq i8* %641, %arraydecay.i.i.i.i428 + br i1 %cmp.i.i.i429, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit431, label %if.then.i.i430 + +if.then.i.i430: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit436 + call void @_ZdlPv(i8* %641) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit431 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit431: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit436, %if.then.i.i430 + call void @llvm.lifetime.end(i64 32, i8* nonnull %89) #2 + %642 = load i8*, i8** %_M_p.i.i473, align 8, !tbaa !62 + %arraydecay.i.i.i.i386 = bitcast %union.anon* %80 to i8* + %cmp.i.i.i387 = icmp eq i8* %642, %arraydecay.i.i.i.i386 + br i1 %cmp.i.i.i387, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit389, label %if.then.i.i388 + +if.then.i.i388: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit431 + call void @_ZdlPv(i8* %642) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit389 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit389: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit431, %if.then.i.i388 + call void @llvm.lifetime.end(i64 32, i8* nonnull %72) #2 + %643 = load i8*, i8** %_M_p.i.i426, align 8, !tbaa !62 + %arraydecay.i.i.i.i381 = bitcast %union.anon* %63 to i8* + %cmp.i.i.i382 = icmp eq i8* %643, %arraydecay.i.i.i.i381 + br i1 %cmp.i.i.i382, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit384, label %if.then.i.i383 + +if.then.i.i383: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit389 + call void @_ZdlPv(i8* %643) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit384 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit384: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit389, %if.then.i.i383 + call void @llvm.lifetime.end(i64 32, i8* nonnull %55) #2 + %644 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !62 + %arraydecay.i.i.i.i340 = bitcast %union.anon* %46 to i8* + %cmp.i.i.i341 = icmp eq i8* %644, %arraydecay.i.i.i.i340 + br i1 %cmp.i.i.i341, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit343, label %if.then.i.i342 + +if.then.i.i342: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit384 + call void @_ZdlPv(i8* %644) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit343 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit343: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit384, %if.then.i.i342 + call void @llvm.lifetime.end(i64 32, i8* nonnull %38) #2 + %645 = load i8*, i8** %_M_p.i.i1192, align 8, !tbaa !62 + %arraydecay.i.i.i.i299 = bitcast %union.anon* %30 to i8* + %cmp.i.i.i300 = icmp eq i8* %645, %arraydecay.i.i.i.i299 + br i1 %cmp.i.i.i300, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit302, label %if.then.i.i301 + +if.then.i.i301: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit343 + call void @_ZdlPv(i8* %645) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit302 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit302: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit343, %if.then.i.i301 + call void @llvm.lifetime.end(i64 32, i8* nonnull %22) #2 + %646 = load i8*, i8** %_M_p.i.i1230, align 8, !tbaa !62 + %arraydecay.i.i.i.i270 = bitcast %union.anon* %14 to i8* + %cmp.i.i.i271 = icmp eq i8* %646, %arraydecay.i.i.i.i270 + br i1 %cmp.i.i.i271, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit273, label %if.then.i.i272 + +if.then.i.i272: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit302 + call void @_ZdlPv(i8* %646) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit273 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit273: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit302, %if.then.i.i272 + call void @llvm.lifetime.end(i64 32, i8* nonnull %6) #2 + %647 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !62 + %cmp.i.i.i = icmp eq i8* %647, %3 br i1 %cmp.i.i.i, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit, label %if.then.i.i -if.then.i.i: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit265 - call void @_ZdlPv(i8* %645) #7 +if.then.i.i: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit273 + call void @_ZdlPv(i8* %647) #2 br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit265, %if.then.i.i - call void @llvm.lifetime.end(i64 32, i8* nonnull %0) #7 +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit273, %if.then.i.i + call void @llvm.lifetime.end(i64 32, i8* nonnull %0) #2 ret i32 0 } -declare void @__visc__init() local_unnamed_addr #3 +declare void @__visc__init() local_unnamed_addr #0 + +declare void @startMemTracking() local_unnamed_addr #0 + +declare void @startProfiling() local_unnamed_addr #0 + +declare i8* @__visc__launch(i32, ...) local_unnamed_addr #0 + +declare void @__visc__wait(i8*) local_unnamed_addr #0 + +declare void @freeBatchMemory() local_unnamed_addr #0 -declare i8* @__visc__launch(i32, ...) local_unnamed_addr #3 +declare void @stopProfiling() local_unnamed_addr #0 -declare void @__visc__wait(i8*) local_unnamed_addr #3 +declare void @__visc__cleanup() local_unnamed_addr #0 -declare void @__visc__cleanup() local_unnamed_addr #3 +; Function Attrs: nounwind readnone +declare float @log10f(float) local_unnamed_addr #8 + +; Function Attrs: nounwind readnone +declare float @sqrtf(float) local_unnamed_addr #8 ; Function Attrs: nobuiltin nounwind -declare void @_ZdlPv(i8*) local_unnamed_addr #6 +declare void @_ZdlPv(i8*) local_unnamed_addr #9 -declare void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"*, %"class.std::basic_streambuf"*) local_unnamed_addr #3 +declare void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"*, %"class.std::basic_streambuf"*) local_unnamed_addr #0 ; Function Attrs: nounwind -declare void @_ZNSt8ios_baseC2Ev(%"class.std::ios_base"*) unnamed_addr #2 +declare void @_ZNSt8ios_baseC2Ev(%"class.std::ios_base"*) unnamed_addr #1 ; Function Attrs: nounwind -declare void @_ZNSt6localeC1Ev(%"class.std::locale"*) unnamed_addr #2 +declare void @_ZNSt6localeC1Ev(%"class.std::locale"*) unnamed_addr #1 ; Function Attrs: nounwind -declare void @_ZNSt6localeD1Ev(%"class.std::locale"*) unnamed_addr #2 +declare void @_ZNSt6localeD1Ev(%"class.std::locale"*) unnamed_addr #1 ; Function Attrs: nounwind -declare void @_ZNSt8ios_baseD2Ev(%"class.std::ios_base"*) unnamed_addr #2 +declare void @_ZNSt8ios_baseD2Ev(%"class.std::ios_base"*) unnamed_addr #1 -declare dereferenceable(272) %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"*, double) local_unnamed_addr #3 +declare dereferenceable(272) %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"*, double) local_unnamed_addr #0 ; Function Attrs: nounwind uwtable -declare void @_ZNKSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEE3strEv(%"class.std::__cxx11::basic_string"* noalias sret, %"class.std::__cxx11::basic_stringbuf"*) local_unnamed_addr #0 align 2 +declare void @_ZNKSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEE3strEv(%"class.std::__cxx11::basic_string"* noalias sret, %"class.std::__cxx11::basic_stringbuf"*) local_unnamed_addr #3 align 2 + +declare i8* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_createERmm(%"class.std::__cxx11::basic_string"*, i64* dereferenceable(8), i64) local_unnamed_addr #0 -declare i8* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_createERmm(%"class.std::__cxx11::basic_string"*, i64* dereferenceable(8), i64) local_unnamed_addr #3 +; Function Attrs: noreturn +declare void @_ZSt17__throw_bad_allocv() local_unnamed_addr #10 + +; Function Attrs: nobuiltin +declare noalias nonnull i8* @_Znwm(i64) local_unnamed_addr #11 ; Function Attrs: argmemonly nounwind -declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #1 +declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #4 -declare dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"*, i64, i64, i8*, i64) local_unnamed_addr #3 +; Function Attrs: nounwind uwtable +define linkonce_odr void @_ZSt16__introsort_loopIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElNS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_T0_T1_(%struct.ClassProb* %__first.coerce, %struct.ClassProb* %__last.coerce, i64 %__depth_limit, i1 (i64, i64)* %__comp.coerce) local_unnamed_addr #3 comdat { +entry: + %0 = ptrtoint %struct.ClassProb* %__first.coerce to i64 + %1 = ptrtoint %struct.ClassProb* %__last.coerce to i64 + %sub.ptr.sub.i33 = sub i64 %1, %0 + %cmp35 = icmp sgt i64 %sub.ptr.sub.i33, 128 + br i1 %cmp35, label %while.body.lr.ph, label %while.end + +while.body.lr.ph: ; preds = %entry + %add.ptr.i33.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 1 + %agg.tmp.sroa.0.0..sroa_cast.i.i36.i = bitcast %struct.ClassProb* %add.ptr.i33.i to i64* + %.sink95.i.i = bitcast %struct.ClassProb* %__first.coerce to i64* + br label %while.body + +while.body: ; preds = %while.body.lr.ph, %_ZSt27__unguarded_partition_pivotIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEET_SD_SD_T0_.exit + %sub.ptr.div.i39.in = phi i64 [ %sub.ptr.sub.i33, %while.body.lr.ph ], [ %sub.ptr.sub.i, %_ZSt27__unguarded_partition_pivotIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEET_SD_SD_T0_.exit ] + %__depth_limit.addr.037 = phi i64 [ %__depth_limit, %while.body.lr.ph ], [ %dec, %_ZSt27__unguarded_partition_pivotIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEET_SD_SD_T0_.exit ] + %__first.sroa.0.1.i.i.sink36 = phi %struct.ClassProb* [ %__last.coerce, %while.body.lr.ph ], [ %__first.sroa.0.1.i.i, %_ZSt27__unguarded_partition_pivotIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEET_SD_SD_T0_.exit ] + %cmp3 = icmp eq i64 %__depth_limit.addr.037, 0 + br i1 %cmp3, label %while.body.i.preheader.i, label %if.end + +while.body.i.preheader.i: ; preds = %while.body + tail call void @_ZSt11__make_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_T0_(%struct.ClassProb* %__first.coerce, %struct.ClassProb* %__first.sroa.0.1.i.i.sink36, i1 (i64, i64)* %__comp.coerce) #2 + br label %while.body.i.i + +while.body.i.i: ; preds = %_ZSt10__pop_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_SD_T0_.exit.i, %while.body.i.preheader.i + %__last.sroa.0.0.in14.i.i = phi %struct.ClassProb* [ %incdec.ptr.i.i17.i, %_ZSt10__pop_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_SD_T0_.exit.i ], [ %__first.sroa.0.1.i.i.sink36, %while.body.i.preheader.i ] + %incdec.ptr.i.i17.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__last.sroa.0.0.in14.i.i, i64 -1 + %2 = ptrtoint %struct.ClassProb* %incdec.ptr.i.i17.i to i64 + %3 = bitcast %struct.ClassProb* %incdec.ptr.i.i17.i to i64* + %4 = load i64, i64* %3, align 4 + %5 = load i64, i64* %.sink95.i.i, align 4 + store i64 %5, i64* %3, align 4 + %sub.ptr.sub.i.i18.i = sub i64 %2, %0 + %sub.ptr.div.i.i.i = ashr exact i64 %sub.ptr.sub.i.i18.i, 3 + %sub.i.i.i = add nsw i64 %sub.ptr.div.i.i.i, -1 + %div.i.i.i = sdiv i64 %sub.i.i.i, 2 + %cmp84.i.i.i = icmp sgt i64 %sub.i.i.i, 1 + br i1 %cmp84.i.i.i, label %while.body.i.i.i.preheader, label %while.end.i.i.i + +while.body.i.i.i.preheader: ; preds = %while.body.i.i + br label %while.body.i.i.i + +while.body.i.i.i: ; preds = %while.body.i.i.i.preheader, %while.body.i.i.i + %__secondChild.085.i.i.i = phi i64 [ %dec.mul.i.i.i, %while.body.i.i.i ], [ 0, %while.body.i.i.i.preheader ] + %add.i.i.i = shl i64 %__secondChild.085.i.i.i, 1 + %mul.i.i.i = add i64 %add.i.i.i, 2 + %add.ptr.i.i.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %mul.i.i.i + %sub4.i.i.i = or i64 %add.i.i.i, 1 + %add.ptr.i66.i.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %sub4.i.i.i + %agg.tmp.sroa.0.0..sroa_cast.i.i.i.i = bitcast %struct.ClassProb* %add.ptr.i.i.i.i to i64* + %agg.tmp.sroa.0.0.copyload.i.i.i.i = load i64, i64* %agg.tmp.sroa.0.0..sroa_cast.i.i.i.i, align 4 + %agg.tmp3.sroa.0.0..sroa_cast.i.i.i.i = bitcast %struct.ClassProb* %add.ptr.i66.i.i.i to i64* + %agg.tmp3.sroa.0.0.copyload.i.i.i.i = load i64, i64* %agg.tmp3.sroa.0.0..sroa_cast.i.i.i.i, align 4 + %call5.i.i.i.i = tail call zeroext i1 %__comp.coerce(i64 %agg.tmp.sroa.0.0.copyload.i.i.i.i, i64 %agg.tmp3.sroa.0.0.copyload.i.i.i.i) #2 + %dec.mul.i.i.i = select i1 %call5.i.i.i.i, i64 %sub4.i.i.i, i64 %mul.i.i.i + %add.ptr.i78.i.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %dec.mul.i.i.i + %add.ptr.i75.i.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %__secondChild.085.i.i.i + %6 = bitcast %struct.ClassProb* %add.ptr.i78.i.i.i to i64* + %7 = bitcast %struct.ClassProb* %add.ptr.i75.i.i.i to i64* + %8 = load i64, i64* %6, align 4 + store i64 %8, i64* %7, align 4 + %cmp.i.i19.i = icmp slt i64 %dec.mul.i.i.i, %div.i.i.i + br i1 %cmp.i.i19.i, label %while.body.i.i.i, label %while.end.i.i.i.loopexit + +while.end.i.i.i.loopexit: ; preds = %while.body.i.i.i + br label %while.end.i.i.i + +while.end.i.i.i: ; preds = %while.end.i.i.i.loopexit, %while.body.i.i + %__secondChild.0.lcssa.i.i.i = phi i64 [ 0, %while.body.i.i ], [ %dec.mul.i.i.i, %while.end.i.i.i.loopexit ] + %and.i.i.i = and i64 %sub.ptr.div.i.i.i, 1 + %cmp18.i.i.i = icmp eq i64 %and.i.i.i, 0 + br i1 %cmp18.i.i.i, label %land.lhs.true.i.i.i, label %if.end36.i.i.i + +land.lhs.true.i.i.i: ; preds = %while.end.i.i.i + %sub19.i.i.i = add nsw i64 %sub.ptr.div.i.i.i, -2 + %div20.i.i.i = sdiv i64 %sub19.i.i.i, 2 + %cmp21.i.i.i = icmp eq i64 %__secondChild.0.lcssa.i.i.i, %div20.i.i.i + br i1 %cmp21.i.i.i, label %if.then22.i.i.i, label %if.end36.i.i.i + +if.then22.i.i.i: ; preds = %land.lhs.true.i.i.i + %add23.i.i.i = shl i64 %__secondChild.0.lcssa.i.i.i, 1 + %sub25.i.i.i = or i64 %add23.i.i.i, 1 + %add.ptr.i72.i.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %sub25.i.i.i + %add.ptr.i69.i.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %__secondChild.0.lcssa.i.i.i + %9 = bitcast %struct.ClassProb* %add.ptr.i72.i.i.i to i64* + %10 = bitcast %struct.ClassProb* %add.ptr.i69.i.i.i to i64* + %11 = load i64, i64* %9, align 4 + store i64 %11, i64* %10, align 4 + br label %if.end36.i.i.i + +if.end36.i.i.i: ; preds = %if.then22.i.i.i, %land.lhs.true.i.i.i, %while.end.i.i.i + %__holeIndex.addr.1.i.i.i = phi i64 [ %sub25.i.i.i, %if.then22.i.i.i ], [ %__secondChild.0.lcssa.i.i.i, %land.lhs.true.i.i.i ], [ %__secondChild.0.lcssa.i.i.i, %while.end.i.i.i ] + %cmp42.i.i.i.i = icmp sgt i64 %__holeIndex.addr.1.i.i.i, 0 + br i1 %cmp42.i.i.i.i, label %land.rhs.i.i.i.i.preheader, label %_ZSt10__pop_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_SD_T0_.exit.i + +land.rhs.i.i.i.i.preheader: ; preds = %if.end36.i.i.i + br label %land.rhs.i.i.i.i + +land.rhs.i.i.i.i: ; preds = %land.rhs.i.i.i.i.preheader, %while.body.i.i.i.i + %__parent.044.in.in.i.i.i.i = phi i64 [ %__parent.044.i.i.i.i, %while.body.i.i.i.i ], [ %__holeIndex.addr.1.i.i.i, %land.rhs.i.i.i.i.preheader ] + %__parent.044.in.i.i.i.i = add nsw i64 %__parent.044.in.in.i.i.i.i, -1 + %__parent.044.i.i.i.i = sdiv i64 %__parent.044.in.i.i.i.i, 2 + %add.ptr.i.i.i.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %__parent.044.i.i.i.i + %agg.tmp.sroa.0.0..sroa_cast.i.i.i.i.i = bitcast %struct.ClassProb* %add.ptr.i.i.i.i.i to i64* + %agg.tmp.sroa.0.0.copyload.i.i.i.i.i = load i64, i64* %agg.tmp.sroa.0.0..sroa_cast.i.i.i.i.i, align 4 + %call3.i.i.i.i.i = tail call zeroext i1 %__comp.coerce(i64 %agg.tmp.sroa.0.0.copyload.i.i.i.i.i, i64 %4) #2 + br i1 %call3.i.i.i.i.i, label %while.body.i.i.i.i, label %_ZSt10__pop_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_SD_T0_.exit.i.loopexit + +while.body.i.i.i.i: ; preds = %land.rhs.i.i.i.i + %add.ptr.i32.i.i.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %__parent.044.in.in.i.i.i.i + %12 = bitcast %struct.ClassProb* %add.ptr.i32.i.i.i.i to i64* + %13 = load i64, i64* %agg.tmp.sroa.0.0..sroa_cast.i.i.i.i.i, align 4 + store i64 %13, i64* %12, align 4 + %cmp.i.i.i.i = icmp sgt i64 %__parent.044.in.i.i.i.i, 1 + br i1 %cmp.i.i.i.i, label %land.rhs.i.i.i.i, label %_ZSt10__pop_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_SD_T0_.exit.i.loopexit + +_ZSt10__pop_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_SD_T0_.exit.i.loopexit: ; preds = %land.rhs.i.i.i.i, %while.body.i.i.i.i + %__parent.0.in.in.lcssa.i.i.i.i.ph = phi i64 [ %__parent.044.i.i.i.i, %while.body.i.i.i.i ], [ %__parent.044.in.in.i.i.i.i, %land.rhs.i.i.i.i ] + br label %_ZSt10__pop_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_SD_T0_.exit.i + +_ZSt10__pop_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_SD_T0_.exit.i: ; preds = %_ZSt10__pop_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_SD_T0_.exit.i.loopexit, %if.end36.i.i.i + %__parent.0.in.in.lcssa.i.i.i.i = phi i64 [ %__holeIndex.addr.1.i.i.i, %if.end36.i.i.i ], [ %__parent.0.in.in.lcssa.i.i.i.i.ph, %_ZSt10__pop_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_SD_T0_.exit.i.loopexit ] + %add.ptr.i29.i.i.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %__parent.0.in.in.lcssa.i.i.i.i + %14 = bitcast %struct.ClassProb* %add.ptr.i29.i.i.i.i to i64* + store i64 %4, i64* %14, align 4 + %cmp.i.i = icmp sgt i64 %sub.ptr.sub.i.i18.i, 8 + br i1 %cmp.i.i, label %while.body.i.i, label %while.end.loopexit + +if.end: ; preds = %while.body + %sub.ptr.div.i3943 = lshr i64 %sub.ptr.div.i39.in, 4 + %dec = add nsw i64 %__depth_limit.addr.037, -1 + %add.ptr.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %sub.ptr.div.i3943 + %add.ptr.i42.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.sroa.0.1.i.i.sink36, i64 -1 + %agg.tmp.sroa.0.0.copyload.i.i37.i = load i64, i64* %agg.tmp.sroa.0.0..sroa_cast.i.i36.i, align 4 + %agg.tmp3.sroa.0.0..sroa_cast.i.i38.i = bitcast %struct.ClassProb* %add.ptr.i.i to i64* + %agg.tmp3.sroa.0.0.copyload.i.i39.i = load i64, i64* %agg.tmp3.sroa.0.0..sroa_cast.i.i38.i, align 4 + %call5.i.i40.i = tail call zeroext i1 %__comp.coerce(i64 %agg.tmp.sroa.0.0.copyload.i.i37.i, i64 %agg.tmp3.sroa.0.0.copyload.i.i39.i) #2 + br i1 %call5.i.i40.i, label %if.then.i.i, label %if.else34.i.i + +if.then.i.i: ; preds = %if.end + %agg.tmp.sroa.0.0.copyload.i66.i.i = load i64, i64* %agg.tmp3.sroa.0.0..sroa_cast.i.i38.i, align 4 + %agg.tmp3.sroa.0.0..sroa_cast.i67.i.i = bitcast %struct.ClassProb* %add.ptr.i42.i to i64* + %agg.tmp3.sroa.0.0.copyload.i68.i.i = load i64, i64* %agg.tmp3.sroa.0.0..sroa_cast.i67.i.i, align 4 + %call5.i69.i.i = tail call zeroext i1 %__comp.coerce(i64 %agg.tmp.sroa.0.0.copyload.i66.i.i, i64 %agg.tmp3.sroa.0.0.copyload.i68.i.i) #2 + br i1 %call5.i69.i.i, label %while.body.i.i28.preheader, label %if.else.i.i + +if.else.i.i: ; preds = %if.then.i.i + %agg.tmp.sroa.0.0.copyload.i78.i.i = load i64, i64* %agg.tmp.sroa.0.0..sroa_cast.i.i36.i, align 4 + %agg.tmp3.sroa.0.0.copyload.i80.i.i = load i64, i64* %agg.tmp3.sroa.0.0..sroa_cast.i67.i.i, align 4 + %call5.i81.i.i = tail call zeroext i1 %__comp.coerce(i64 %agg.tmp.sroa.0.0.copyload.i78.i.i, i64 %agg.tmp3.sroa.0.0.copyload.i80.i.i) #2 + %agg.tmp3.sroa.0.0..sroa_cast.i67.agg.tmp.sroa.0.0..sroa_cast.i.i.i = select i1 %call5.i81.i.i, i64* %agg.tmp3.sroa.0.0..sroa_cast.i67.i.i, i64* %agg.tmp.sroa.0.0..sroa_cast.i.i36.i + br label %while.body.i.i28.preheader + +if.else34.i.i: ; preds = %if.end + %agg.tmp.sroa.0.0.copyload.i84.i.i = load i64, i64* %agg.tmp.sroa.0.0..sroa_cast.i.i36.i, align 4 + %agg.tmp3.sroa.0.0..sroa_cast.i85.i.i = bitcast %struct.ClassProb* %add.ptr.i42.i to i64* + %agg.tmp3.sroa.0.0.copyload.i86.i.i = load i64, i64* %agg.tmp3.sroa.0.0..sroa_cast.i85.i.i, align 4 + %call5.i87.i.i = tail call zeroext i1 %__comp.coerce(i64 %agg.tmp.sroa.0.0.copyload.i84.i.i, i64 %agg.tmp3.sroa.0.0.copyload.i86.i.i) #2 + br i1 %call5.i87.i.i, label %while.body.i.i28.preheader, label %if.else45.i.i + +if.else45.i.i: ; preds = %if.else34.i.i + %agg.tmp.sroa.0.0.copyload.i72.i.i = load i64, i64* %agg.tmp3.sroa.0.0..sroa_cast.i.i38.i, align 4 + %agg.tmp3.sroa.0.0.copyload.i74.i.i = load i64, i64* %agg.tmp3.sroa.0.0..sroa_cast.i85.i.i, align 4 + %call5.i75.i.i = tail call zeroext i1 %__comp.coerce(i64 %agg.tmp.sroa.0.0.copyload.i72.i.i, i64 %agg.tmp3.sroa.0.0.copyload.i74.i.i) #2 + %agg.tmp3.sroa.0.0..sroa_cast.i85.agg.tmp3.sroa.0.0..sroa_cast.i.i.i = select i1 %call5.i75.i.i, i64* %agg.tmp3.sroa.0.0..sroa_cast.i85.i.i, i64* %agg.tmp3.sroa.0.0..sroa_cast.i.i38.i + br label %while.body.i.i28.preheader + +while.body.i.i28.preheader: ; preds = %if.else45.i.i, %if.else34.i.i, %if.else.i.i, %if.then.i.i + %agg.tmp3.sroa.0.0..sroa_cast.i34.i.sink46.i.ph = phi i64* [ %agg.tmp3.sroa.0.0..sroa_cast.i.i38.i, %if.then.i.i ], [ %agg.tmp3.sroa.0.0..sroa_cast.i67.agg.tmp.sroa.0.0..sroa_cast.i.i.i, %if.else.i.i ], [ %agg.tmp.sroa.0.0..sroa_cast.i.i36.i, %if.else34.i.i ], [ %agg.tmp3.sroa.0.0..sroa_cast.i85.agg.tmp3.sroa.0.0..sroa_cast.i.i.i, %if.else45.i.i ] + br label %while.body.i.i28 + +while.body.i.i28: ; preds = %while.body.i.i28.preheader, %while.end19.i.i + %agg.tmp.sroa.0.0..sroa_cast.i.i.sink47.i = phi i64* [ %agg.tmp.sroa.0.0..sroa_cast.i.i.i, %while.end19.i.i ], [ %.sink95.i.i, %while.body.i.i28.preheader ] + %agg.tmp3.sroa.0.0..sroa_cast.i34.i.sink46.i = phi i64* [ %agg.tmp3.sroa.0.0..sroa_cast.i34.i.i, %while.end19.i.i ], [ %agg.tmp3.sroa.0.0..sroa_cast.i34.i.sink46.i.ph, %while.body.i.i28.preheader ] + %__last.sroa.0.0.i.i = phi %struct.ClassProb* [ %incdec.ptr.i30.i.i, %while.end19.i.i ], [ %__first.sroa.0.1.i.i.sink36, %while.body.i.i28.preheader ] + %__first.sroa.0.0.i.i = phi %struct.ClassProb* [ %incdec.ptr.i.i.i, %while.end19.i.i ], [ %add.ptr.i33.i, %while.body.i.i28.preheader ] + %15 = load i64, i64* %agg.tmp.sroa.0.0..sroa_cast.i.i.sink47.i, align 4 + %16 = load i64, i64* %agg.tmp3.sroa.0.0..sroa_cast.i34.i.sink46.i, align 4 + store i64 %16, i64* %agg.tmp.sroa.0.0..sroa_cast.i.i.sink47.i, align 4 + store i64 %15, i64* %agg.tmp3.sroa.0.0..sroa_cast.i34.i.sink46.i, align 4 + br label %while.cond4.i.i + +while.cond4.i.i: ; preds = %while.cond4.i.i, %while.body.i.i28 + %__first.sroa.0.1.i.i = phi %struct.ClassProb* [ %__first.sroa.0.0.i.i, %while.body.i.i28 ], [ %incdec.ptr.i.i.i, %while.cond4.i.i ] + %agg.tmp.sroa.0.0..sroa_cast.i.i.i = bitcast %struct.ClassProb* %__first.sroa.0.1.i.i to i64* + %agg.tmp.sroa.0.0.copyload.i.i.i = load i64, i64* %agg.tmp.sroa.0.0..sroa_cast.i.i.i, align 4 + %agg.tmp3.sroa.0.0.copyload.i.i.i = load i64, i64* %.sink95.i.i, align 4 + %call5.i.i.i = tail call zeroext i1 %__comp.coerce(i64 %agg.tmp.sroa.0.0.copyload.i.i.i, i64 %agg.tmp3.sroa.0.0.copyload.i.i.i) #2 + %incdec.ptr.i.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.sroa.0.1.i.i, i64 1 + br i1 %call5.i.i.i, label %while.cond4.i.i, label %while.cond11.i.i.preheader + +while.cond11.i.i.preheader: ; preds = %while.cond4.i.i + br label %while.cond11.i.i + +while.cond11.i.i: ; preds = %while.cond11.i.i.preheader, %while.cond11.i.i + %__last.sroa.0.1.sink.i.i = phi %struct.ClassProb* [ %incdec.ptr.i30.i.i, %while.cond11.i.i ], [ %__last.sroa.0.0.i.i, %while.cond11.i.i.preheader ] + %incdec.ptr.i30.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__last.sroa.0.1.sink.i.i, i64 -1 + %agg.tmp.sroa.0.0.copyload.i33.i.i = load i64, i64* %.sink95.i.i, align 4 + %agg.tmp3.sroa.0.0..sroa_cast.i34.i.i = bitcast %struct.ClassProb* %incdec.ptr.i30.i.i to i64* + %agg.tmp3.sroa.0.0.copyload.i35.i.i = load i64, i64* %agg.tmp3.sroa.0.0..sroa_cast.i34.i.i, align 4 + %call5.i36.i.i = tail call zeroext i1 %__comp.coerce(i64 %agg.tmp.sroa.0.0.copyload.i33.i.i, i64 %agg.tmp3.sroa.0.0.copyload.i35.i.i) #2 + br i1 %call5.i36.i.i, label %while.cond11.i.i, label %while.end19.i.i + +while.end19.i.i: ; preds = %while.cond11.i.i + %cmp.i.i.i = icmp ult %struct.ClassProb* %__first.sroa.0.1.i.i, %incdec.ptr.i30.i.i + br i1 %cmp.i.i.i, label %while.body.i.i28, label %_ZSt27__unguarded_partition_pivotIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEET_SD_SD_T0_.exit + +_ZSt27__unguarded_partition_pivotIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEET_SD_SD_T0_.exit: ; preds = %while.end19.i.i + tail call void @_ZSt16__introsort_loopIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElNS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_T0_T1_(%struct.ClassProb* %__first.sroa.0.1.i.i, %struct.ClassProb* %__first.sroa.0.1.i.i.sink36, i64 %dec, i1 (i64, i64)* %__comp.coerce) + %17 = ptrtoint %struct.ClassProb* %__first.sroa.0.1.i.i to i64 + %sub.ptr.sub.i = sub i64 %17, %0 + %cmp = icmp sgt i64 %sub.ptr.sub.i, 128 + br i1 %cmp, label %while.body, label %while.end.loopexit48 + +while.end.loopexit: ; preds = %_ZSt10__pop_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_SD_T0_.exit.i + br label %while.end + +while.end.loopexit48: ; preds = %_ZSt27__unguarded_partition_pivotIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEET_SD_SD_T0_.exit + br label %while.end + +while.end: ; preds = %while.end.loopexit48, %while.end.loopexit, %entry + ret void +} + +; Function Attrs: nounwind uwtable +define linkonce_odr void @_ZSt11__make_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_T0_(%struct.ClassProb* %__first.coerce, %struct.ClassProb* %__last.coerce, i1 (i64, i64)* %__comp.coerce) local_unnamed_addr #3 comdat { +entry: + %0 = ptrtoint %struct.ClassProb* %__first.coerce to i64 + %1 = ptrtoint %struct.ClassProb* %__last.coerce to i64 + %sub.ptr.sub.i = sub i64 %1, %0 + %sub.ptr.div.i = ashr exact i64 %sub.ptr.sub.i, 3 + %cmp = icmp slt i64 %sub.ptr.sub.i, 16 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %sub = add nsw i64 %sub.ptr.div.i, -2 + %div = sdiv i64 %sub, 2 + %sub.i = add nsw i64 %sub.ptr.div.i, -1 + %div.i = sdiv i64 %sub.i, 2 + %and.i = and i64 %sub.ptr.div.i, 1 + %cmp18.i = icmp eq i64 %and.i, 0 + br i1 %cmp18.i, label %while.cond.us.preheader, label %while.cond.preheader + +while.cond.preheader: ; preds = %if.end + br label %while.cond + +while.cond.us.preheader: ; preds = %if.end + %add23.i.us = shl nsw i64 %div, 1 + %sub25.i.us = or i64 %add23.i.us, 1 + %add.ptr.i72.i.us = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %sub25.i.us + %add.ptr.i69.i.us = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %div + %2 = bitcast %struct.ClassProb* %add.ptr.i72.i.us to i64* + %3 = bitcast %struct.ClassProb* %add.ptr.i69.i.us to i64* + br label %while.cond.us + +while.cond.us: ; preds = %while.cond.us.preheader, %_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit.us + %__parent.0.us = phi i64 [ %dec.us, %_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit.us ], [ %div, %while.cond.us.preheader ] + %add.ptr.i.us = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %__parent.0.us + %4 = bitcast %struct.ClassProb* %add.ptr.i.us to i64* + %5 = load i64, i64* %4, align 4 + %cmp84.i.us = icmp sgt i64 %div.i, %__parent.0.us + br i1 %cmp84.i.us, label %while.body.i.us.preheader, label %while.end.i.us + +while.body.i.us.preheader: ; preds = %while.cond.us + br label %while.body.i.us + +while.body.i.us: ; preds = %while.body.i.us.preheader, %while.body.i.us + %__secondChild.085.i.us = phi i64 [ %dec.mul.i.us, %while.body.i.us ], [ %__parent.0.us, %while.body.i.us.preheader ] + %add.i.us = shl i64 %__secondChild.085.i.us, 1 + %mul.i.us = add i64 %add.i.us, 2 + %add.ptr.i.i.us = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %mul.i.us + %sub4.i.us = or i64 %add.i.us, 1 + %add.ptr.i66.i.us = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %sub4.i.us + %agg.tmp.sroa.0.0..sroa_cast.i.i.us = bitcast %struct.ClassProb* %add.ptr.i.i.us to i64* + %agg.tmp.sroa.0.0.copyload.i.i.us = load i64, i64* %agg.tmp.sroa.0.0..sroa_cast.i.i.us, align 4 + %agg.tmp3.sroa.0.0..sroa_cast.i.i.us = bitcast %struct.ClassProb* %add.ptr.i66.i.us to i64* + %agg.tmp3.sroa.0.0.copyload.i.i.us = load i64, i64* %agg.tmp3.sroa.0.0..sroa_cast.i.i.us, align 4 + %call5.i.i.us = tail call zeroext i1 %__comp.coerce(i64 %agg.tmp.sroa.0.0.copyload.i.i.us, i64 %agg.tmp3.sroa.0.0.copyload.i.i.us) #2 + %dec.mul.i.us = select i1 %call5.i.i.us, i64 %sub4.i.us, i64 %mul.i.us + %add.ptr.i78.i.us = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %dec.mul.i.us + %add.ptr.i75.i.us = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %__secondChild.085.i.us + %6 = bitcast %struct.ClassProb* %add.ptr.i78.i.us to i64* + %7 = bitcast %struct.ClassProb* %add.ptr.i75.i.us to i64* + %8 = load i64, i64* %6, align 4 + store i64 %8, i64* %7, align 4 + %cmp.i.us = icmp slt i64 %dec.mul.i.us, %div.i + br i1 %cmp.i.us, label %while.body.i.us, label %while.end.i.us.loopexit + +while.end.i.us.loopexit: ; preds = %while.body.i.us + br label %while.end.i.us + +while.end.i.us: ; preds = %while.end.i.us.loopexit, %while.cond.us + %__secondChild.0.lcssa.i.us = phi i64 [ %__parent.0.us, %while.cond.us ], [ %dec.mul.i.us, %while.end.i.us.loopexit ] + %cmp21.i.us = icmp eq i64 %__secondChild.0.lcssa.i.us, %div + br i1 %cmp21.i.us, label %if.then22.i.us, label %if.end36.i.us + +if.then22.i.us: ; preds = %while.end.i.us + %9 = load i64, i64* %2, align 4 + store i64 %9, i64* %3, align 4 + br label %if.end36.i.us + +if.end36.i.us: ; preds = %if.then22.i.us, %while.end.i.us + %__holeIndex.addr.1.i.us = phi i64 [ %sub25.i.us, %if.then22.i.us ], [ %__secondChild.0.lcssa.i.us, %while.end.i.us ] + %cmp42.i.i.us = icmp sgt i64 %__holeIndex.addr.1.i.us, %__parent.0.us + br i1 %cmp42.i.i.us, label %land.rhs.i.i.us.preheader, label %_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit.us + +land.rhs.i.i.us.preheader: ; preds = %if.end36.i.us + br label %land.rhs.i.i.us + +land.rhs.i.i.us: ; preds = %land.rhs.i.i.us.preheader, %while.body.i.i.us + %__parent.044.in.in.i.i.us = phi i64 [ %__parent.044.i.i.us, %while.body.i.i.us ], [ %__holeIndex.addr.1.i.us, %land.rhs.i.i.us.preheader ] + %__parent.044.in.i.i.us = add nsw i64 %__parent.044.in.in.i.i.us, -1 + %__parent.044.i.i.us = sdiv i64 %__parent.044.in.i.i.us, 2 + %add.ptr.i.i.i.us = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %__parent.044.i.i.us + %agg.tmp.sroa.0.0..sroa_cast.i.i.i.us = bitcast %struct.ClassProb* %add.ptr.i.i.i.us to i64* + %agg.tmp.sroa.0.0.copyload.i.i.i.us = load i64, i64* %agg.tmp.sroa.0.0..sroa_cast.i.i.i.us, align 4 + %call3.i.i.i.us = tail call zeroext i1 %__comp.coerce(i64 %agg.tmp.sroa.0.0.copyload.i.i.i.us, i64 %5) #2 + br i1 %call3.i.i.i.us, label %while.body.i.i.us, label %_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit.us.loopexit + +while.body.i.i.us: ; preds = %land.rhs.i.i.us + %add.ptr.i32.i.i.us = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %__parent.044.in.in.i.i.us + %10 = bitcast %struct.ClassProb* %add.ptr.i32.i.i.us to i64* + %11 = load i64, i64* %agg.tmp.sroa.0.0..sroa_cast.i.i.i.us, align 4 + store i64 %11, i64* %10, align 4 + %cmp.i.i.us = icmp sgt i64 %__parent.044.i.i.us, %__parent.0.us + br i1 %cmp.i.i.us, label %land.rhs.i.i.us, label %_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit.us.loopexit + +_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit.us.loopexit: ; preds = %while.body.i.i.us, %land.rhs.i.i.us + %__parent.0.in.in.lcssa.i.i.us.ph = phi i64 [ %__parent.044.i.i.us, %while.body.i.i.us ], [ %__parent.044.in.in.i.i.us, %land.rhs.i.i.us ] + br label %_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit.us + +_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit.us: ; preds = %_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit.us.loopexit, %if.end36.i.us + %__parent.0.in.in.lcssa.i.i.us = phi i64 [ %__holeIndex.addr.1.i.us, %if.end36.i.us ], [ %__parent.0.in.in.lcssa.i.i.us.ph, %_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit.us.loopexit ] + %add.ptr.i29.i.i.us = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %__parent.0.in.in.lcssa.i.i.us + %12 = bitcast %struct.ClassProb* %add.ptr.i29.i.i.us to i64* + store i64 %5, i64* %12, align 4 + %cmp13.us = icmp eq i64 %__parent.0.us, 0 + %dec.us = add nsw i64 %__parent.0.us, -1 + br i1 %cmp13.us, label %return.loopexit, label %while.cond.us + +while.cond: ; preds = %while.cond.preheader, %_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit + %__parent.0 = phi i64 [ %dec, %_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit ], [ %div, %while.cond.preheader ] + %add.ptr.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %__parent.0 + %13 = bitcast %struct.ClassProb* %add.ptr.i to i64* + %14 = load i64, i64* %13, align 4 + %cmp84.i = icmp sgt i64 %div.i, %__parent.0 + br i1 %cmp84.i, label %while.body.i.preheader, label %_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit + +while.body.i.preheader: ; preds = %while.cond + br label %while.body.i + +while.body.i: ; preds = %while.body.i.preheader, %while.body.i + %__secondChild.085.i = phi i64 [ %dec.mul.i, %while.body.i ], [ %__parent.0, %while.body.i.preheader ] + %add.i = shl i64 %__secondChild.085.i, 1 + %mul.i = add i64 %add.i, 2 + %add.ptr.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %mul.i + %sub4.i = or i64 %add.i, 1 + %add.ptr.i66.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %sub4.i + %agg.tmp.sroa.0.0..sroa_cast.i.i = bitcast %struct.ClassProb* %add.ptr.i.i to i64* + %agg.tmp.sroa.0.0.copyload.i.i = load i64, i64* %agg.tmp.sroa.0.0..sroa_cast.i.i, align 4 + %agg.tmp3.sroa.0.0..sroa_cast.i.i = bitcast %struct.ClassProb* %add.ptr.i66.i to i64* + %agg.tmp3.sroa.0.0.copyload.i.i = load i64, i64* %agg.tmp3.sroa.0.0..sroa_cast.i.i, align 4 + %call5.i.i = tail call zeroext i1 %__comp.coerce(i64 %agg.tmp.sroa.0.0.copyload.i.i, i64 %agg.tmp3.sroa.0.0.copyload.i.i) #2 + %dec.mul.i = select i1 %call5.i.i, i64 %sub4.i, i64 %mul.i + %add.ptr.i78.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %dec.mul.i + %add.ptr.i75.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %__secondChild.085.i + %15 = bitcast %struct.ClassProb* %add.ptr.i78.i to i64* + %16 = bitcast %struct.ClassProb* %add.ptr.i75.i to i64* + %17 = load i64, i64* %15, align 4 + store i64 %17, i64* %16, align 4 + %cmp.i = icmp slt i64 %dec.mul.i, %div.i + br i1 %cmp.i, label %while.body.i, label %if.end36.i + +if.end36.i: ; preds = %while.body.i + %cmp42.i.i = icmp sgt i64 %dec.mul.i, %__parent.0 + br i1 %cmp42.i.i, label %land.rhs.i.i.preheader, label %_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit + +land.rhs.i.i.preheader: ; preds = %if.end36.i + br label %land.rhs.i.i + +land.rhs.i.i: ; preds = %land.rhs.i.i.preheader, %while.body.i.i + %__parent.044.in.in.i.i = phi i64 [ %__parent.044.i.i, %while.body.i.i ], [ %dec.mul.i, %land.rhs.i.i.preheader ] + %__parent.044.in.i.i = add nsw i64 %__parent.044.in.in.i.i, -1 + %__parent.044.i.i = sdiv i64 %__parent.044.in.i.i, 2 + %add.ptr.i.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %__parent.044.i.i + %agg.tmp.sroa.0.0..sroa_cast.i.i.i = bitcast %struct.ClassProb* %add.ptr.i.i.i to i64* + %agg.tmp.sroa.0.0.copyload.i.i.i = load i64, i64* %agg.tmp.sroa.0.0..sroa_cast.i.i.i, align 4 + %call3.i.i.i = tail call zeroext i1 %__comp.coerce(i64 %agg.tmp.sroa.0.0.copyload.i.i.i, i64 %14) #2 + br i1 %call3.i.i.i, label %while.body.i.i, label %_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit.loopexit + +while.body.i.i: ; preds = %land.rhs.i.i + %add.ptr.i32.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %__parent.044.in.in.i.i + %18 = bitcast %struct.ClassProb* %add.ptr.i32.i.i to i64* + %19 = load i64, i64* %agg.tmp.sroa.0.0..sroa_cast.i.i.i, align 4 + store i64 %19, i64* %18, align 4 + %cmp.i.i = icmp sgt i64 %__parent.044.i.i, %__parent.0 + br i1 %cmp.i.i, label %land.rhs.i.i, label %_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit.loopexit + +_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit.loopexit: ; preds = %while.body.i.i, %land.rhs.i.i + %__parent.0.in.in.lcssa.i.i.ph = phi i64 [ %__parent.044.i.i, %while.body.i.i ], [ %__parent.044.in.in.i.i, %land.rhs.i.i ] + br label %_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit + +_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit: ; preds = %_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit.loopexit, %while.cond, %if.end36.i + %__parent.0.in.in.lcssa.i.i = phi i64 [ %dec.mul.i, %if.end36.i ], [ %__parent.0, %while.cond ], [ %__parent.0.in.in.lcssa.i.i.ph, %_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit.loopexit ] + %add.ptr.i29.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %__parent.0.in.in.lcssa.i.i + %20 = bitcast %struct.ClassProb* %add.ptr.i29.i.i to i64* + store i64 %14, i64* %20, align 4 + %cmp13 = icmp eq i64 %__parent.0, 0 + %dec = add nsw i64 %__parent.0, -1 + br i1 %cmp13, label %return.loopexit34, label %while.cond + +return.loopexit: ; preds = %_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit.us + br label %return + +return.loopexit34: ; preds = %_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit + br label %return + +return: ; preds = %return.loopexit34, %return.loopexit, %entry + ret void +} + +; Function Attrs: nounwind readnone +declare i64 @llvm.ctlz.i64(i64, i1) #12 + +declare dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"*, i64, i64, i8*, i64) local_unnamed_addr #0 + +; Function Attrs: nounwind uwtable +define internal void @_GLOBAL__sub_I_vgg16_cifar10.cpp() #3 section ".text.startup" { +entry: + tail call void @_ZNSt8ios_base4InitC1Ev(%"class.std::ios_base::Init"* nonnull @_ZStL8__ioinit) #2 + %0 = tail call i32 @__cxa_atexit(void (i8*)* bitcast (void (%"class.std::ios_base::Init"*)* @_ZNSt8ios_base4InitD1Ev to void (i8*)*), i8* getelementptr inbounds (%"class.std::ios_base::Init", %"class.std::ios_base::Init"* @_ZStL8__ioinit, i64 0, i32 0), i8* nonnull @__dso_handle) #2 + tail call void @llvm.memset.p0i8.i64(i8* bitcast (%"class.std::vector"* @run_accuracies to i8*), i8 0, i64 24, i32 8, i1 false) #2 + %1 = tail call i32 @__cxa_atexit(void (i8*)* bitcast (void (%"class.std::vector"*)* @_ZNSt6vectorIfSaIfEED2Ev to void (i8*)*), i8* bitcast (%"class.std::vector"* @run_accuracies to i8*), i8* nonnull @__dso_handle) #2 + ret void +} ; Function Attrs: nounwind -declare i32 @puts(i8* nocapture readonly) #7 +declare i32 @puts(i8* nocapture readonly) #2 declare i32 @putchar(i32) +; Function Attrs: nounwind +declare i32 @fputc(i32, %struct._IO_FILE* nocapture) #2 + ; Function Attrs: argmemonly nounwind -declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i32, i1) #1 - -attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #1 = { argmemonly nounwind } -attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #4 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #5 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #6 = { nobuiltin nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #7 = { nounwind } -attributes #8 = { noreturn nounwind } +declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i32, i1) #4 + +attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #1 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #2 = { nounwind } +attributes #3 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #4 = { argmemonly nounwind } +attributes #5 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #6 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #7 = { norecurse nounwind readnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #8 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #9 = { nobuiltin nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #10 = { noreturn "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #11 = { nobuiltin "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #12 = { nounwind readnone } +attributes #13 = { noreturn nounwind } !llvm.ident = !{!0} !0 = !{!"clang version 4.0.1 "} -!1 = !{!2, !7, i64 40} -!2 = !{!"_ZTS6Tensor", !3, i64 0, !3, i64 4, !6, i64 8, !7, i64 16, !7, i64 24, !7, i64 32, !7, i64 40, !8, i64 48, !8, i64 56, !9, i64 64} -!3 = !{!"int", !4, i64 0} -!4 = !{!"omnipotent char", !5, i64 0} -!5 = !{!"Simple C++ TBAA"} -!6 = !{!"_ZTS15data_location_t", !4, i64 0} -!7 = !{!"any pointer", !4, i64 0} -!8 = !{!"long", !4, i64 0} -!9 = !{!"_ZTS9Dimension", !3, i64 0, !7, i64 8} -!10 = !{!2, !3, i64 64} -!11 = !{!2, !7, i64 72} -!12 = !{!8, !8, i64 0} -!13 = !{!2, !8, i64 48} -!14 = !{!2, !8, i64 56} -!15 = !{!2, !7, i64 32} -!16 = !{!2, !3, i64 0} -!17 = !{!18, !18, i64 0} -!18 = !{!"float", !4, i64 0} -!19 = distinct !{!19, !20} -!20 = !{!"llvm.loop.unroll.disable"} -!21 = distinct !{!21, !22, !23} -!22 = !{!"llvm.loop.vectorize.width", i32 1} -!23 = !{!"llvm.loop.interleave.count", i32 1} -!24 = distinct !{!24, !22, !23} -!25 = distinct !{!25, !20} -!26 = distinct !{!26, !22, !23} -!27 = distinct !{!27, !20} -!28 = distinct !{!28, !22, !23} -!29 = distinct !{!29, !22, !23} -!30 = distinct !{!30, !22, !23} -!31 = distinct !{!31, !20} -!32 = distinct !{!32, !22, !23} -!33 = distinct !{!33, !22, !23} -!34 = distinct !{!34, !22, !23} -!35 = distinct !{!35, !22, !23} -!36 = !{!4, !4, i64 0} -!37 = distinct !{!37, !22, !23} -!38 = distinct !{!38, !39, !22, !23} -!39 = !{!"llvm.loop.unroll.runtime.disable"} -!40 = !{!41, !41, i64 0} -!41 = !{!"vtable pointer", !5, i64 0} -!42 = !{!43, !7, i64 216} -!43 = !{!"_ZTSSt9basic_iosIcSt11char_traitsIcEE", !7, i64 216, !4, i64 224, !44, i64 225, !7, i64 232, !7, i64 240, !7, i64 248, !7, i64 256} -!44 = !{!"bool", !4, i64 0} -!45 = !{!43, !4, i64 224} -!46 = !{!43, !44, i64 225} -!47 = !{!48, !49, i64 64} -!48 = !{!"_ZTSNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE", !49, i64 64, !50, i64 72} -!49 = !{!"_ZTSSt13_Ios_Openmode", !4, i64 0} -!50 = !{!"_ZTSNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE", !51, i64 0, !8, i64 8, !4, i64 16} -!51 = !{!"_ZTSNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_Alloc_hiderE", !7, i64 0} -!52 = !{!51, !7, i64 0} -!53 = !{!50, !8, i64 8} -!54 = !{!55, !55, i64 0} -!55 = !{!"_ZTSSt13_Ios_Fmtflags", !4, i64 0} -!56 = !{!50, !7, i64 0} -!57 = !{!58, !55, i64 24} -!58 = !{!"_ZTSSt8ios_base", !8, i64 8, !8, i64 16, !55, i64 24, !59, i64 28, !59, i64 32, !7, i64 40, !60, i64 48, !4, i64 64, !3, i64 192, !7, i64 200, !61, i64 208} -!59 = !{!"_ZTSSt12_Ios_Iostate", !4, i64 0} -!60 = !{!"_ZTSNSt8ios_base6_WordsE", !7, i64 0, !8, i64 8} -!61 = !{!"_ZTSSt6locale", !7, i64 0} -!62 = !{!63} -!63 = distinct !{!63, !64, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_: %agg.result"} -!64 = distinct !{!64, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_"} -!65 = !{!66} -!66 = distinct !{!66, !67, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_: %agg.result"} -!67 = distinct !{!67, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_"} -!68 = !{!69} -!69 = distinct !{!69, !70, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_: %agg.result"} -!70 = distinct !{!70, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_"} -!71 = !{!72} -!72 = distinct !{!72, !73, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_: %agg.result"} -!73 = distinct !{!73, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_"} -!74 = !{!75} -!75 = distinct !{!75, !76, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_: %agg.result"} -!76 = distinct !{!76, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_"} -!77 = !{!78} -!78 = distinct !{!78, !79, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_: %agg.result"} -!79 = distinct !{!79, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_"} +!1 = !{!2, !4, i64 0} +!2 = !{!"_ZTSSt12_Vector_baseIfSaIfEE", !3, i64 0} +!3 = !{!"_ZTSNSt12_Vector_baseIfSaIfEE12_Vector_implE", !4, i64 0, !4, i64 8, !4, i64 16} +!4 = !{!"any pointer", !5, i64 0} +!5 = !{!"omnipotent char", !6, i64 0} +!6 = !{!"Simple C++ TBAA"} +!7 = !{!8, !4, i64 56} +!8 = !{!"_ZTS6Tensor", !9, i64 0, !9, i64 4, !9, i64 8, !10, i64 12, !4, i64 16, !4, i64 24, !4, i64 32, !4, i64 40, !4, i64 48, !4, i64 56, !4, i64 64, !11, i64 72, !11, i64 80, !12, i64 88} +!9 = !{!"int", !5, i64 0} +!10 = !{!"_ZTS15data_location_t", !5, i64 0} +!11 = !{!"long", !5, i64 0} +!12 = !{!"_ZTS9Dimension", !9, i64 0, !4, i64 8} +!13 = !{!8, !9, i64 88} +!14 = !{!8, !4, i64 96} +!15 = !{!11, !11, i64 0} +!16 = !{!8, !11, i64 72} +!17 = !{!8, !4, i64 48} +!18 = !{!8, !11, i64 80} +!19 = !{!8, !9, i64 0} +!20 = !{!21, !21, i64 0} +!21 = !{!"float", !5, i64 0} +!22 = distinct !{!22, !23} +!23 = !{!"llvm.loop.unroll.disable"} +!24 = distinct !{!24, !25, !26} +!25 = !{!"llvm.loop.vectorize.width", i32 1} +!26 = !{!"llvm.loop.interleave.count", i32 1} +!27 = distinct !{!27, !25, !26} +!28 = distinct !{!28, !23} +!29 = distinct !{!29, !25, !26} +!30 = distinct !{!30, !23} +!31 = distinct !{!31, !25, !26} +!32 = distinct !{!32, !25, !26} +!33 = distinct !{!33, !25, !26} +!34 = distinct !{!34, !23} +!35 = distinct !{!35, !25, !26} +!36 = distinct !{!36, !25, !26} +!37 = distinct !{!37, !23} +!38 = distinct !{!38, !25, !26} +!39 = distinct !{!39, !25, !26} +!40 = distinct !{!40, !25, !26} +!41 = distinct !{!41, !25, !26} +!42 = !{!5, !5, i64 0} +!43 = distinct !{!43, !25, !26} +!44 = distinct !{!44, !45, !25, !26} +!45 = !{!"llvm.loop.unroll.runtime.disable"} +!46 = !{!47, !47, i64 0} +!47 = !{!"vtable pointer", !6, i64 0} +!48 = !{!49, !4, i64 216} +!49 = !{!"_ZTSSt9basic_iosIcSt11char_traitsIcEE", !4, i64 216, !5, i64 224, !50, i64 225, !4, i64 232, !4, i64 240, !4, i64 248, !4, i64 256} +!50 = !{!"bool", !5, i64 0} +!51 = !{!49, !5, i64 224} +!52 = !{!49, !50, i64 225} +!53 = !{!54, !55, i64 64} +!54 = !{!"_ZTSNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE", !55, i64 64, !56, i64 72} +!55 = !{!"_ZTSSt13_Ios_Openmode", !5, i64 0} +!56 = !{!"_ZTSNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE", !57, i64 0, !11, i64 8, !5, i64 16} +!57 = !{!"_ZTSNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_Alloc_hiderE", !4, i64 0} +!58 = !{!57, !4, i64 0} +!59 = !{!56, !11, i64 8} +!60 = !{!61, !61, i64 0} +!61 = !{!"_ZTSSt13_Ios_Fmtflags", !5, i64 0} +!62 = !{!56, !4, i64 0} +!63 = distinct !{!63, !25, !26} +!64 = distinct !{!64, !25, !26} +!65 = !{!66, !61, i64 24} +!66 = !{!"_ZTSSt8ios_base", !11, i64 8, !11, i64 16, !61, i64 24, !67, i64 28, !67, i64 32, !4, i64 40, !68, i64 48, !5, i64 64, !9, i64 192, !4, i64 200, !69, i64 208} +!67 = !{!"_ZTSSt12_Ios_Iostate", !5, i64 0} +!68 = !{!"_ZTSNSt8ios_base6_WordsE", !4, i64 0, !11, i64 8} +!69 = !{!"_ZTSSt6locale", !4, i64 0} +!70 = !{!9, !9, i64 0} +!71 = distinct !{!71, !25, !26} +!72 = distinct !{!72, !45, !25, !26} +!73 = !{!4, !4, i64 0} +!74 = !{!2, !4, i64 8} +!75 = !{!2, !4, i64 16} +!76 = distinct !{!76, !25, !26} +!77 = distinct !{!77, !25, !26} +!78 = distinct !{!78, !45, !25, !26} +!79 = distinct !{!79, !45, !25, !26} !80 = !{!81} !81 = distinct !{!81, !82, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_: %agg.result"} !82 = distinct !{!82, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_"} @@ -5549,67 +8737,85 @@ attributes #8 = { noreturn nounwind } !155 = !{!156} !156 = distinct !{!156, !157, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_: %agg.result"} !157 = distinct !{!157, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_"} -!158 = !{!159, !7, i64 0} -!159 = !{!"_ZTS6RootIn", !7, i64 0, !8, i64 8, !7, i64 16, !8, i64 24, !7, i64 32, !8, i64 40, !7, i64 48, !8, i64 56, !7, i64 64, !8, i64 72, !7, i64 80, !8, i64 88, !7, i64 96, !8, i64 104, !7, i64 112, !8, i64 120, !7, i64 128, !8, i64 136, !7, i64 144, !8, i64 152, !7, i64 160, !8, i64 168, !7, i64 176, !8, i64 184, !7, i64 192, !8, i64 200, !7, i64 208, !8, i64 216, !7, i64 224, !8, i64 232, !7, i64 240, !8, i64 248, !7, i64 256, !8, i64 264, !7, i64 272, !8, i64 280, !7, i64 288, !8, i64 296, !7, i64 304, !8, i64 312, !7, i64 320, !8, i64 328, !7, i64 336, !8, i64 344, !7, i64 352, !8, i64 360, !7, i64 368, !8, i64 376, !7, i64 384, !8, i64 392, !7, i64 400, !8, i64 408, !7, i64 416, !8, i64 424, !7, i64 432, !8, i64 440, !7, i64 448, !8, i64 456, !7, i64 464, !8, i64 472, !7, i64 480, !8, i64 488, !160, i64 496} -!160 = !{!"_ZTS5ret_t", !7, i64 0, !8, i64 8} -!161 = !{!159, !8, i64 8} -!162 = !{!159, !7, i64 16} -!163 = !{!159, !8, i64 24} -!164 = !{!159, !7, i64 32} -!165 = !{!159, !8, i64 40} -!166 = !{!159, !7, i64 48} -!167 = !{!159, !8, i64 56} -!168 = !{!159, !7, i64 64} -!169 = !{!159, !8, i64 72} -!170 = !{!159, !7, i64 80} -!171 = !{!159, !8, i64 88} -!172 = !{!159, !7, i64 96} -!173 = !{!159, !8, i64 104} -!174 = !{!159, !7, i64 112} -!175 = !{!159, !8, i64 120} -!176 = !{!159, !7, i64 128} -!177 = !{!159, !8, i64 136} -!178 = !{!159, !7, i64 144} -!179 = !{!159, !8, i64 152} -!180 = !{!159, !7, i64 160} -!181 = !{!159, !8, i64 168} -!182 = !{!159, !7, i64 176} -!183 = !{!159, !8, i64 184} -!184 = !{!159, !7, i64 192} -!185 = !{!159, !8, i64 200} -!186 = !{!159, !7, i64 208} -!187 = !{!159, !8, i64 216} -!188 = !{!159, !7, i64 224} -!189 = !{!159, !8, i64 232} -!190 = !{!159, !7, i64 240} -!191 = !{!159, !8, i64 248} -!192 = !{!159, !7, i64 256} -!193 = !{!159, !8, i64 264} -!194 = !{!159, !7, i64 272} -!195 = !{!159, !8, i64 280} -!196 = !{!159, !7, i64 288} -!197 = !{!159, !8, i64 296} -!198 = !{!159, !7, i64 304} -!199 = !{!159, !8, i64 312} -!200 = !{!159, !7, i64 320} -!201 = !{!159, !8, i64 328} -!202 = !{!159, !7, i64 336} -!203 = !{!159, !8, i64 344} -!204 = !{!159, !7, i64 352} -!205 = !{!159, !8, i64 360} -!206 = !{!159, !7, i64 368} -!207 = !{!159, !8, i64 376} -!208 = !{!159, !7, i64 384} -!209 = !{!159, !8, i64 392} -!210 = !{!159, !7, i64 400} -!211 = !{!159, !8, i64 408} -!212 = !{!159, !7, i64 416} -!213 = !{!159, !8, i64 424} -!214 = !{!159, !7, i64 432} -!215 = !{!159, !8, i64 440} -!216 = !{!159, !7, i64 448} -!217 = !{!159, !8, i64 456} -!218 = !{!159, !7, i64 464} -!219 = !{!159, !8, i64 472} -!220 = !{!159, !7, i64 480} -!221 = !{!159, !8, i64 488} +!158 = !{!159} +!159 = distinct !{!159, !160, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_: %agg.result"} +!160 = distinct !{!160, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_"} +!161 = !{!162} +!162 = distinct !{!162, !163, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_: %agg.result"} +!163 = distinct !{!163, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_"} +!164 = !{!165} +!165 = distinct !{!165, !166, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_: %agg.result"} +!166 = distinct !{!166, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_"} +!167 = !{!168} +!168 = distinct !{!168, !169, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_: %agg.result"} +!169 = distinct !{!169, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_"} +!170 = !{!171} +!171 = distinct !{!171, !172, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_: %agg.result"} +!172 = distinct !{!172, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_"} +!173 = !{!174} +!174 = distinct !{!174, !175, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_: %agg.result"} +!175 = distinct !{!175, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_"} +!176 = !{!177, !4, i64 0} +!177 = !{!"_ZTS6RootIn", !4, i64 0, !11, i64 8, !4, i64 16, !11, i64 24, !4, i64 32, !11, i64 40, !4, i64 48, !11, i64 56, !4, i64 64, !11, i64 72, !4, i64 80, !11, i64 88, !4, i64 96, !11, i64 104, !4, i64 112, !11, i64 120, !4, i64 128, !11, i64 136, !4, i64 144, !11, i64 152, !4, i64 160, !11, i64 168, !4, i64 176, !11, i64 184, !4, i64 192, !11, i64 200, !4, i64 208, !11, i64 216, !4, i64 224, !11, i64 232, !4, i64 240, !11, i64 248, !4, i64 256, !11, i64 264, !4, i64 272, !11, i64 280, !4, i64 288, !11, i64 296, !4, i64 304, !11, i64 312, !4, i64 320, !11, i64 328, !4, i64 336, !11, i64 344, !4, i64 352, !11, i64 360, !4, i64 368, !11, i64 376, !4, i64 384, !11, i64 392, !4, i64 400, !11, i64 408, !4, i64 416, !11, i64 424, !4, i64 432, !11, i64 440, !4, i64 448, !11, i64 456, !4, i64 464, !11, i64 472, !4, i64 480, !11, i64 488, !178, i64 496} +!178 = !{!"_ZTS5ret_t", !4, i64 0, !11, i64 8} +!179 = !{!177, !11, i64 8} +!180 = !{!177, !4, i64 16} +!181 = !{!177, !11, i64 24} +!182 = !{!177, !4, i64 32} +!183 = !{!177, !11, i64 40} +!184 = !{!177, !4, i64 48} +!185 = !{!177, !11, i64 56} +!186 = !{!177, !4, i64 64} +!187 = !{!177, !11, i64 72} +!188 = !{!177, !4, i64 80} +!189 = !{!177, !11, i64 88} +!190 = !{!177, !4, i64 96} +!191 = !{!177, !11, i64 104} +!192 = !{!177, !4, i64 112} +!193 = !{!177, !11, i64 120} +!194 = !{!177, !4, i64 128} +!195 = !{!177, !11, i64 136} +!196 = !{!177, !4, i64 144} +!197 = !{!177, !11, i64 152} +!198 = !{!177, !4, i64 160} +!199 = !{!177, !11, i64 168} +!200 = !{!177, !4, i64 176} +!201 = !{!177, !11, i64 184} +!202 = !{!177, !4, i64 192} +!203 = !{!177, !11, i64 200} +!204 = !{!177, !4, i64 208} +!205 = !{!177, !11, i64 216} +!206 = !{!177, !4, i64 224} +!207 = !{!177, !11, i64 232} +!208 = !{!177, !4, i64 240} +!209 = !{!177, !11, i64 248} +!210 = !{!177, !4, i64 256} +!211 = !{!177, !11, i64 264} +!212 = !{!177, !4, i64 272} +!213 = !{!177, !11, i64 280} +!214 = !{!177, !4, i64 288} +!215 = !{!177, !11, i64 296} +!216 = !{!177, !4, i64 304} +!217 = !{!177, !11, i64 312} +!218 = !{!177, !4, i64 320} +!219 = !{!177, !11, i64 328} +!220 = !{!177, !4, i64 336} +!221 = !{!177, !11, i64 344} +!222 = !{!177, !4, i64 352} +!223 = !{!177, !11, i64 360} +!224 = !{!177, !4, i64 368} +!225 = !{!177, !11, i64 376} +!226 = !{!177, !4, i64 384} +!227 = !{!177, !11, i64 392} +!228 = !{!177, !4, i64 400} +!229 = !{!177, !11, i64 408} +!230 = !{!177, !4, i64 416} +!231 = !{!177, !11, i64 424} +!232 = !{!177, !4, i64 432} +!233 = !{!177, !11, i64 440} +!234 = !{!177, !4, i64 448} +!235 = !{!177, !11, i64 456} +!236 = !{!177, !4, i64 464} +!237 = !{!177, !11, i64 472} +!238 = !{!177, !4, i64 480} +!239 = !{!177, !11, i64 488} diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/build/vgg16_cifar10.opt.bc b/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/build/vgg16_cifar10.opt.bc deleted file mode 100644 index 3219048b57801e110245112716cccde0b8c4aa78..0000000000000000000000000000000000000000 Binary files a/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/build/vgg16_cifar10.opt.bc and /dev/null differ diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/build/vgg16_cifar10.visc.ll b/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/build/vgg16_cifar10.visc.ll index bec01ca7d31361c6a7b39230407563c6c468f53e..d14dfd99b13d1a9e49c977510da83378359323a4 100644 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/build/vgg16_cifar10.visc.ll +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/build/vgg16_cifar10.visc.ll @@ -3,9 +3,13 @@ source_filename = "src/vgg16_cifar10.cpp" target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" +%"class.std::ios_base::Init" = type { i8 } +%"class.std::vector" = type { %"struct.std::_Vector_base" } +%"struct.std::_Vector_base" = type { %"struct.std::_Vector_base<float, std::allocator<float> >::_Vector_impl" } +%"struct.std::_Vector_base<float, std::allocator<float> >::_Vector_impl" = type { float*, float*, float* } %struct._IO_FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] } %struct._IO_marker = type { %struct._IO_marker*, %struct._IO_FILE*, i32 } -%struct.Tensor = type { i32, i32, i32, %struct.cudnnTensorStruct*, %struct.cudnnFilterStruct*, i8*, i8*, i64, i64, %struct.Dimension } +%struct.Tensor = type { i32, i32, i32, i32, %struct.cudnnTensorStruct*, %struct.cudnnFilterStruct*, %struct.cudnnTensorStruct*, %struct.cudnnFilterStruct*, i8*, i8*, i8*, i64, i64, %struct.Dimension } %struct.cudnnTensorStruct = type opaque %struct.cudnnFilterStruct = type opaque %struct.Dimension = type { i32, i64* } @@ -30,6 +34,7 @@ target triple = "x86_64-unknown-linux-gnu" %struct.__locale_data = type opaque %"class.std::num_put" = type { %"class.std::locale::facet.base", [4 x i8] } %"class.std::num_get" = type { %"class.std::locale::facet.base", [4 x i8] } +%struct.ClassProb = type { float, i32 } %struct.out._Z10var_0_nodePvmS_m = type <{ i8*, i64 }> %struct.out._Z10var_1_nodePvmS_m = type <{ i8*, i64 }> %struct.out._Z10var_2_nodePvm = type <{ i8*, i64 }> @@ -82,73 +87,125 @@ target triple = "x86_64-unknown-linux-gnu" %struct.out._Z11var_49_nodePvm = type <{ i8*, i64 }> %struct.out._Z4rootPvmS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_m = type <{ i8*, i64 }> -@.str.1 = private unnamed_addr constant [19 x i8] c"tensor dims = %d \0A\00", align 1 -@.str.2 = private unnamed_addr constant [18 x i8] c"dim1_size = %zu \0A\00", align 1 -@.str.3 = private unnamed_addr constant [18 x i8] c"dim2_size = %zu \0A\00", align 1 -@.str.4 = private unnamed_addr constant [18 x i8] c"num_elems = %zu \0A\00", align 1 -@.str.5 = private unnamed_addr constant [3 x i8] c"wb\00", align 1 -@.str.6 = private unnamed_addr constant [58 x i8] c"File %s could not be created. Check if directory exists \0A\00", align 1 -@.str.7 = private unnamed_addr constant [22 x i8] c"size_in_bytes = %zu \0A\00", align 1 -@.str.8 = private unnamed_addr constant [21 x i8] c"bytes_written = %zu\0A\00", align 1 -@.str.9 = private unnamed_addr constant [4 x i8] c"%f,\00", align 1 -@.str.11 = private unnamed_addr constant [18 x i8] c"Num_elems = %zu \0A\00", align 1 -@.str.12 = private unnamed_addr constant [16 x i8] c"dim[%d] = %zu \0A\00", align 1 -@.str.13 = private unnamed_addr constant [35 x i8] c"Tensor data mismatch at index %d \0A\00", align 1 -@.str.14 = private unnamed_addr constant [21 x i8] c"Tensor data mismatch\00", align 1 -@.str.15 = private unnamed_addr constant [3 x i8] c"rb\00", align 1 -@.str.16 = private unnamed_addr constant [41 x i8] c"Data file %s is not found. Aborting... \0A\00", align 1 -@.str.17 = private unnamed_addr constant [23 x i8] c"tensor_data[%d] = %f \0A\00", align 1 -@.str.18 = private unnamed_addr constant [40 x i8] c"Data file %s is not found. Aborting...\0A\00", align 1 -@.str.19 = private unnamed_addr constant [26 x i8] c"*Label bytes_read = %zu \0A\00", align 1 -@.str.20 = private unnamed_addr constant [24 x i8] c"****** Accuracy = %f \0A\0A\00", align 1 -@.str.21 = private unnamed_addr constant [15 x i8] c"final_accuracy\00", align 1 -@.str.22 = private unnamed_addr constant [3 x i8] c"w+\00", align 1 -@.str.23 = private unnamed_addr constant [72 x i8] c"../../../../../../projects/hpvm-tensor-rt/model_params/vgg16_cifar10_2/\00", align 1 -@.str.24 = private unnamed_addr constant [10 x i8] c"input.bin\00", align 1 -@.str.25 = private unnamed_addr constant [11 x i8] c"labels.bin\00", align 1 -@.str.26 = private unnamed_addr constant [15 x i8] c"conv2d_1_w.bin\00", align 1 -@.str.27 = private unnamed_addr constant [15 x i8] c"conv2d_1_b.bin\00", align 1 -@.str.28 = private unnamed_addr constant [15 x i8] c"conv2d_2_w.bin\00", align 1 -@.str.29 = private unnamed_addr constant [15 x i8] c"conv2d_2_b.bin\00", align 1 -@.str.30 = private unnamed_addr constant [15 x i8] c"conv2d_3_w.bin\00", align 1 -@.str.31 = private unnamed_addr constant [15 x i8] c"conv2d_3_b.bin\00", align 1 -@.str.32 = private unnamed_addr constant [15 x i8] c"conv2d_4_w.bin\00", align 1 -@.str.33 = private unnamed_addr constant [15 x i8] c"conv2d_4_b.bin\00", align 1 -@.str.34 = private unnamed_addr constant [15 x i8] c"conv2d_5_w.bin\00", align 1 -@.str.35 = private unnamed_addr constant [15 x i8] c"conv2d_5_b.bin\00", align 1 -@.str.36 = private unnamed_addr constant [15 x i8] c"conv2d_6_w.bin\00", align 1 -@.str.37 = private unnamed_addr constant [15 x i8] c"conv2d_6_b.bin\00", align 1 -@.str.38 = private unnamed_addr constant [15 x i8] c"conv2d_7_w.bin\00", align 1 -@.str.39 = private unnamed_addr constant [15 x i8] c"conv2d_7_b.bin\00", align 1 -@.str.40 = private unnamed_addr constant [15 x i8] c"conv2d_8_w.bin\00", align 1 -@.str.41 = private unnamed_addr constant [15 x i8] c"conv2d_8_b.bin\00", align 1 -@.str.42 = private unnamed_addr constant [15 x i8] c"conv2d_9_w.bin\00", align 1 -@.str.43 = private unnamed_addr constant [15 x i8] c"conv2d_9_b.bin\00", align 1 -@.str.44 = private unnamed_addr constant [16 x i8] c"conv2d_10_w.bin\00", align 1 -@.str.45 = private unnamed_addr constant [16 x i8] c"conv2d_10_b.bin\00", align 1 -@.str.46 = private unnamed_addr constant [16 x i8] c"conv2d_11_w.bin\00", align 1 -@.str.47 = private unnamed_addr constant [16 x i8] c"conv2d_11_b.bin\00", align 1 -@.str.48 = private unnamed_addr constant [16 x i8] c"conv2d_12_w.bin\00", align 1 -@.str.49 = private unnamed_addr constant [16 x i8] c"conv2d_12_b.bin\00", align 1 -@.str.50 = private unnamed_addr constant [16 x i8] c"conv2d_13_w.bin\00", align 1 -@.str.51 = private unnamed_addr constant [16 x i8] c"conv2d_13_b.bin\00", align 1 -@.str.52 = private unnamed_addr constant [14 x i8] c"dense_1_w.bin\00", align 1 -@.str.53 = private unnamed_addr constant [14 x i8] c"dense_1_b.bin\00", align 1 -@.str.54 = private unnamed_addr constant [14 x i8] c"dense_2_w.bin\00", align 1 -@.str.55 = private unnamed_addr constant [14 x i8] c"dense_2_b.bin\00", align 1 +$_ZNSt6vectorIfSaIfEED2Ev = comdat any + +$_ZSt16__introsort_loopIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElNS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_T0_T1_ = comdat any + +$_ZSt11__make_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_T0_ = comdat any + +@_ZStL8__ioinit = internal global %"class.std::ios_base::Init" zeroinitializer, align 1 +@__dso_handle = external global i8 +@run_accuracies = global %"class.std::vector" zeroinitializer, align 8 +@.str.2 = private unnamed_addr constant [19 x i8] c"tensor dims = %d \0A\00", align 1 +@.str.3 = private unnamed_addr constant [18 x i8] c"dim1_size = %lu \0A\00", align 1 +@.str.4 = private unnamed_addr constant [18 x i8] c"dim2_size = %lu \0A\00", align 1 +@.str.5 = private unnamed_addr constant [18 x i8] c"num_elems = %lu \0A\00", align 1 +@.str.6 = private unnamed_addr constant [3 x i8] c"wb\00", align 1 +@.str.7 = private unnamed_addr constant [58 x i8] c"File %s could not be created. Check if directory exists \0A\00", align 1 +@.str.8 = private unnamed_addr constant [4 x i8] c"%f,\00", align 1 +@.str.10 = private unnamed_addr constant [18 x i8] c"Num_elems = %lu \0A\00", align 1 +@.str.11 = private unnamed_addr constant [16 x i8] c"dim[%d] = %lu \0A\00", align 1 +@.str.12 = private unnamed_addr constant [35 x i8] c"Tensor data mismatch at index %d \0A\00", align 1 +@.str.13 = private unnamed_addr constant [21 x i8] c"Tensor data mismatch\00", align 1 +@.str.14 = private unnamed_addr constant [3 x i8] c"rb\00", align 1 +@.str.15 = private unnamed_addr constant [41 x i8] c"Data file %s is not found. Aborting... \0A\00", align 1 +@.str.16 = private unnamed_addr constant [40 x i8] c"size in bytes = %lu, bytes read = %lu \0A\00", align 1 +@.str.17 = private unnamed_addr constant [23 x i8] c"size_in_bytes = %lu \0A\00", align 1 +@.str.18 = private unnamed_addr constant [31 x i8] c"******NOTE: tensor Dims = %d \0A\00", align 1 +@.str.20 = private unnamed_addr constant [40 x i8] c"Data file %s is not found. Aborting...\0A\00", align 1 +@.str.21 = private unnamed_addr constant [24 x i8] c"****** Accuracy = %f \0A\0A\00", align 1 +@.str.22 = private unnamed_addr constant [15 x i8] c"final_accuracy\00", align 1 +@.str.23 = private unnamed_addr constant [3 x i8] c"w+\00", align 1 +@.str.24 = private unnamed_addr constant [34 x i8] c"batch_dim = %lu, channels = %lu \0A\00", align 1 +@.str.25 = private unnamed_addr constant [37 x i8] c"batch_dim = %lu, num_classes = %lu \0A\00", align 1 +@.str.26 = private unnamed_addr constant [30 x i8] c"\0A\0A **** Final Accuracy = %f \0A\00", align 1 +@.str.27 = private unnamed_addr constant [9 x i8] c"avg_psnr\00", align 1 +@.str.28 = private unnamed_addr constant [13 x i8] c"psnr_std.txt\00", align 1 +@.str.29 = private unnamed_addr constant [19 x i8] c"run_accuracies.txt\00", align 1 +@.str.30 = private unnamed_addr constant [2 x i8] c"r\00", align 1 +@.str.32 = private unnamed_addr constant [3 x i8] c"%f\00", align 1 +@.str.33 = private unnamed_addr constant [23 x i8] c"**** PSNR read = %f \0A\0A\00", align 1 +@.str.34 = private unnamed_addr constant [9 x i8] c"psnr.txt\00", align 1 +@.str.35 = private unnamed_addr constant [36 x i8] c"batch_dim = %lu, image_size = %lu \0A\00", align 1 +@.str.36 = private unnamed_addr constant [13 x i8] c"img_psnr.txt\00", align 1 +@.str.37 = private unnamed_addr constant [18 x i8] c"PSNR value = %f \0A\00", align 1 +@.str.38 = private unnamed_addr constant [26 x i8] c"*** violation_rate= %f \0A\0A\00", align 1 +@.str.39 = private unnamed_addr constant [22 x i8] c"*** avg_psnr = %f \0A\0A\00", align 1 +@.str.40 = private unnamed_addr constant [23 x i8] c"** Output size = %lu \0A\00", align 1 +@.str.41 = private unnamed_addr constant [70 x i8] c"../../../../../../projects/hpvm-tensor-rt/model_params/vgg16_cifar10/\00", align 1 +@.str.42 = private unnamed_addr constant [10 x i8] c"input.bin\00", align 1 +@.str.43 = private unnamed_addr constant [13 x i8] c"labels32.bin\00", align 1 +@.str.44 = private unnamed_addr constant [15 x i8] c"conv2d_1_w.bin\00", align 1 +@.str.45 = private unnamed_addr constant [15 x i8] c"conv2d_1_b.bin\00", align 1 +@.str.46 = private unnamed_addr constant [15 x i8] c"conv2d_2_w.bin\00", align 1 +@.str.47 = private unnamed_addr constant [15 x i8] c"conv2d_2_b.bin\00", align 1 +@.str.48 = private unnamed_addr constant [15 x i8] c"conv2d_3_w.bin\00", align 1 +@.str.49 = private unnamed_addr constant [15 x i8] c"conv2d_3_b.bin\00", align 1 +@.str.50 = private unnamed_addr constant [15 x i8] c"conv2d_4_w.bin\00", align 1 +@.str.51 = private unnamed_addr constant [15 x i8] c"conv2d_4_b.bin\00", align 1 +@.str.52 = private unnamed_addr constant [15 x i8] c"conv2d_5_w.bin\00", align 1 +@.str.53 = private unnamed_addr constant [15 x i8] c"conv2d_5_b.bin\00", align 1 +@.str.54 = private unnamed_addr constant [15 x i8] c"conv2d_6_w.bin\00", align 1 +@.str.55 = private unnamed_addr constant [15 x i8] c"conv2d_6_b.bin\00", align 1 +@.str.56 = private unnamed_addr constant [15 x i8] c"conv2d_7_w.bin\00", align 1 +@.str.57 = private unnamed_addr constant [15 x i8] c"conv2d_7_b.bin\00", align 1 +@.str.58 = private unnamed_addr constant [15 x i8] c"conv2d_8_w.bin\00", align 1 +@.str.59 = private unnamed_addr constant [15 x i8] c"conv2d_8_b.bin\00", align 1 +@.str.60 = private unnamed_addr constant [15 x i8] c"conv2d_9_w.bin\00", align 1 +@.str.61 = private unnamed_addr constant [15 x i8] c"conv2d_9_b.bin\00", align 1 +@.str.62 = private unnamed_addr constant [16 x i8] c"conv2d_10_w.bin\00", align 1 +@.str.63 = private unnamed_addr constant [16 x i8] c"conv2d_10_b.bin\00", align 1 +@.str.64 = private unnamed_addr constant [16 x i8] c"conv2d_11_w.bin\00", align 1 +@.str.65 = private unnamed_addr constant [16 x i8] c"conv2d_11_b.bin\00", align 1 +@.str.66 = private unnamed_addr constant [16 x i8] c"conv2d_12_w.bin\00", align 1 +@.str.67 = private unnamed_addr constant [16 x i8] c"conv2d_12_b.bin\00", align 1 +@.str.68 = private unnamed_addr constant [16 x i8] c"conv2d_13_w.bin\00", align 1 +@.str.69 = private unnamed_addr constant [16 x i8] c"conv2d_13_b.bin\00", align 1 +@.str.70 = private unnamed_addr constant [14 x i8] c"dense_1_w.bin\00", align 1 +@.str.71 = private unnamed_addr constant [14 x i8] c"dense_1_b.bin\00", align 1 +@.str.72 = private unnamed_addr constant [14 x i8] c"dense_2_w.bin\00", align 1 +@.str.73 = private unnamed_addr constant [14 x i8] c"dense_2_b.bin\00", align 1 @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE = external unnamed_addr constant { [5 x i8*], [5 x i8*] } @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE = external unnamed_addr constant [4 x i8*] @_ZTVSt9basic_iosIcSt11char_traitsIcEE = external unnamed_addr constant { [4 x i8*] } @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE = external unnamed_addr constant { [16 x i8*] } @_ZTVSt15basic_streambufIcSt11char_traitsIcEE = external unnamed_addr constant { [16 x i8*] } +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @_GLOBAL__sub_I_vgg16_cifar10.cpp, i8* null }] @str = private unnamed_addr constant [23 x i8] c"Successful cudaMalloc \00" +@str.78 = private unnamed_addr constant [27 x i8] c"ERROR: NULL data pointers \00" +@str.79 = private unnamed_addr constant [28 x i8] c"ERROR: psnr.txt not found! \00" + +declare void @_ZNSt8ios_base4InitC1Ev(%"class.std::ios_base::Init"*) unnamed_addr #0 + +; Function Attrs: nounwind +declare void @_ZNSt8ios_base4InitD1Ev(%"class.std::ios_base::Init"*) unnamed_addr #1 + +; Function Attrs: nounwind +declare i32 @__cxa_atexit(void (i8*)*, i8*, i8*) local_unnamed_addr #2 + +; Function Attrs: nounwind uwtable +define linkonce_odr void @_ZNSt6vectorIfSaIfEED2Ev(%"class.std::vector"* %this) unnamed_addr #3 comdat align 2 { +entry: + %_M_start.i = getelementptr inbounds %"class.std::vector", %"class.std::vector"* %this, i64 0, i32 0, i32 0, i32 0 + %0 = load float*, float** %_M_start.i, align 8, !tbaa !52 + %tobool.i.i = icmp eq float* %0, null + br i1 %tobool.i.i, label %_ZNSt12_Vector_baseIfSaIfEED2Ev.exit, label %if.then.i.i + +if.then.i.i: ; preds = %entry + %1 = bitcast float* %0 to i8* + tail call void @_ZdlPv(i8* %1) #2 + br label %_ZNSt12_Vector_baseIfSaIfEED2Ev.exit + +_ZNSt12_Vector_baseIfSaIfEED2Ev.exit: ; preds = %if.then.i.i, %entry + ret void +} ; Function Attrs: nounwind uwtable -define void @_Z15printTensorInfoPv(i8* nocapture readonly %tensor_ptr) local_unnamed_addr #0 { +define void @_Z15printTensorInfoPv(i8* nocapture readonly %tensor_ptr) local_unnamed_addr #3 { entry: - %gpu_data = getelementptr inbounds i8, i8* %tensor_ptr, i64 40 + %gpu_data = getelementptr inbounds i8, i8* %tensor_ptr, i64 56 %0 = bitcast i8* %gpu_data to i8** - %1 = load i8*, i8** %0, align 8, !tbaa !52 + %1 = load i8*, i8** %0, align 8, !tbaa !58 %cmp = icmp eq i8* %1, null br i1 %cmp, label %if.end, label %if.then @@ -157,93 +214,90 @@ if.then: ; preds = %entry br label %if.end if.end: ; preds = %if.then, %entry - %dims = getelementptr inbounds i8, i8* %tensor_ptr, i64 64 + %dims = getelementptr inbounds i8, i8* %tensor_ptr, i64 88 %num_dims = bitcast i8* %dims to i32* - %2 = load i32, i32* %num_dims, align 8, !tbaa !61 - %call1 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([19 x i8], [19 x i8]* @.str.1, i64 0, i64 0), i32 %2) - %dim_sizes = getelementptr inbounds i8, i8* %tensor_ptr, i64 72 + %2 = load i32, i32* %num_dims, align 8, !tbaa !64 + %call1 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([19 x i8], [19 x i8]* @.str.2, i64 0, i64 0), i32 %2) + %dim_sizes = getelementptr inbounds i8, i8* %tensor_ptr, i64 96 %3 = bitcast i8* %dim_sizes to i64** - %4 = load i64*, i64** %3, align 8, !tbaa !62 - %5 = load i64, i64* %4, align 8, !tbaa !63 - %call3 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.2, i64 0, i64 0), i64 %5) - %6 = load i64*, i64** %3, align 8, !tbaa !62 + %4 = load i64*, i64** %3, align 8, !tbaa !65 + %5 = load i64, i64* %4, align 8, !tbaa !66 + %call3 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.3, i64 0, i64 0), i64 %5) + %6 = load i64*, i64** %3, align 8, !tbaa !65 %arrayidx6 = getelementptr inbounds i64, i64* %6, i64 1 - %7 = load i64, i64* %arrayidx6, align 8, !tbaa !63 - %call7 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.3, i64 0, i64 0), i64 %7) - %num_elems = getelementptr inbounds i8, i8* %tensor_ptr, i64 48 + %7 = load i64, i64* %arrayidx6, align 8, !tbaa !66 + %call7 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.4, i64 0, i64 0), i64 %7) + %num_elems = getelementptr inbounds i8, i8* %tensor_ptr, i64 72 %8 = bitcast i8* %num_elems to i64* - %9 = load i64, i64* %8, align 8, !tbaa !64 - %call8 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.4, i64 0, i64 0), i64 %9) + %9 = load i64, i64* %8, align 8, !tbaa !67 + %call8 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.5, i64 0, i64 0), i64 %9) ret void } ; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.start(i64, i8* nocapture) #1 +declare void @llvm.lifetime.start(i64, i8* nocapture) #4 ; Function Attrs: nounwind -declare i32 @printf(i8* nocapture readonly, ...) local_unnamed_addr #2 +declare i32 @printf(i8* nocapture readonly, ...) local_unnamed_addr #1 ; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.end(i64, i8* nocapture) #1 +declare void @llvm.lifetime.end(i64, i8* nocapture) #4 ; Function Attrs: nounwind uwtable -define void @_Z17dumpWeightsToFilePcPv(i8* %file_name, i8* %weights_ptr) local_unnamed_addr #0 { +define void @_Z17dumpWeightsToFilePcPv(i8* %file_name, i8* %weights_ptr) local_unnamed_addr #3 { entry: - tail call void @hpvm_request_tensor(i8* %weights_ptr, i32 0) #7 - %call = tail call %struct._IO_FILE* @fopen(i8* %file_name, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.5, i64 0, i64 0)) + tail call void @hpvm_request_tensor(i8* %weights_ptr, i32 0) #2 + %call = tail call %struct._IO_FILE* @fopen(i8* %file_name, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.6, i64 0, i64 0)) %cmp = icmp eq %struct._IO_FILE* %call, null br i1 %cmp, label %if.then, label %if.end if.then: ; preds = %entry - %call1 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([58 x i8], [58 x i8]* @.str.6, i64 0, i64 0), i8* %file_name) - tail call void @abort() #8 + %call1 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([58 x i8], [58 x i8]* @.str.7, i64 0, i64 0), i8* %file_name) + tail call void @abort() #13 unreachable if.end: ; preds = %entry - %size_in_bytes = getelementptr inbounds i8, i8* %weights_ptr, i64 56 - %0 = bitcast i8* %size_in_bytes to i64* - %1 = load i64, i64* %0, align 8, !tbaa !65 - %call2 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.7, i64 0, i64 0), i64 %1) - %host_data = getelementptr inbounds i8, i8* %weights_ptr, i64 32 - %2 = bitcast i8* %host_data to i8** - %3 = load i8*, i8** %2, align 8, !tbaa !66 - %4 = load i64, i64* %0, align 8, !tbaa !65 - %call4 = tail call i64 @fwrite(i8* %3, i64 1, i64 %4, %struct._IO_FILE* nonnull %call) - %call5 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([21 x i8], [21 x i8]* @.str.8, i64 0, i64 0), i64 %call4) - %call6 = tail call i32 @fclose(%struct._IO_FILE* nonnull %call) + %host_data = getelementptr inbounds i8, i8* %weights_ptr, i64 48 + %0 = bitcast i8* %host_data to i8** + %1 = load i8*, i8** %0, align 8, !tbaa !68 + %size_in_bytes = getelementptr inbounds i8, i8* %weights_ptr, i64 80 + %2 = bitcast i8* %size_in_bytes to i64* + %3 = load i64, i64* %2, align 8, !tbaa !69 + %call2 = tail call i64 @fwrite(i8* %1, i64 1, i64 %3, %struct._IO_FILE* nonnull %call) + %call3 = tail call i32 @fclose(%struct._IO_FILE* nonnull %call) ret void } -declare void @hpvm_request_tensor(i8*, i32) local_unnamed_addr #3 +declare void @hpvm_request_tensor(i8*, i32) local_unnamed_addr #0 ; Function Attrs: nounwind -declare noalias %struct._IO_FILE* @fopen(i8* nocapture readonly, i8* nocapture readonly) local_unnamed_addr #2 +declare noalias %struct._IO_FILE* @fopen(i8* nocapture readonly, i8* nocapture readonly) local_unnamed_addr #1 ; Function Attrs: noreturn nounwind -declare void @abort() local_unnamed_addr #4 +declare void @abort() local_unnamed_addr #5 ; Function Attrs: nounwind -declare i64 @fwrite(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) local_unnamed_addr #2 +declare i64 @fwrite(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) local_unnamed_addr #1 ; Function Attrs: nounwind -declare i32 @fclose(%struct._IO_FILE* nocapture) local_unnamed_addr #2 +declare i32 @fclose(%struct._IO_FILE* nocapture) local_unnamed_addr #1 ; Function Attrs: nounwind uwtable -define void @_Z18fillTensorWithOnesPv(i8* %tensor_ptr) local_unnamed_addr #0 { +define void @_Z18fillTensorWithOnesPv(i8* %tensor_ptr) local_unnamed_addr #3 { entry: - tail call void @hpvm_request_tensor(i8* %tensor_ptr, i32 0) #7 + tail call void @hpvm_request_tensor(i8* %tensor_ptr, i32 0) #2 %data_type = bitcast i8* %tensor_ptr to i32* - %0 = load i32, i32* %data_type, align 8, !tbaa !67 + %0 = load i32, i32* %data_type, align 8, !tbaa !70 %cmp = icmp eq i32 %0, 0 br i1 %cmp, label %if.then, label %if.end if.then: ; preds = %entry - %host_data = getelementptr inbounds i8, i8* %tensor_ptr, i64 32 + %host_data = getelementptr inbounds i8, i8* %tensor_ptr, i64 48 %1 = bitcast i8* %host_data to float** - %2 = load float*, float** %1, align 8, !tbaa !66 - %num_elems = getelementptr inbounds i8, i8* %tensor_ptr, i64 48 + %2 = load float*, float** %1, align 8, !tbaa !68 + %num_elems = getelementptr inbounds i8, i8* %tensor_ptr, i64 72 %3 = bitcast i8* %num_elems to i64* - %4 = load i64, i64* %3, align 8, !tbaa !64 + %4 = load i64, i64* %3, align 8, !tbaa !67 %cmp110 = icmp eq i64 %4, 0 br i1 %cmp110, label %if.end, label %for.body.preheader @@ -281,14 +335,14 @@ vector.body.prol: ; preds = %vector.body.prol, % %prol.iter = phi i64 [ %prol.iter.sub, %vector.body.prol ], [ %xtraiter, %vector.body.prol.preheader ] %13 = getelementptr inbounds float, float* %2, i64 %index.prol %14 = bitcast float* %13 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %14, align 4, !tbaa !68 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %14, align 4, !tbaa !71 %15 = getelementptr float, float* %13, i64 4 %16 = bitcast float* %15 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %16, align 4, !tbaa !68 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %16, align 4, !tbaa !71 %index.next.prol = add i64 %index.prol, 8 %prol.iter.sub = add i64 %prol.iter, -1 %prol.iter.cmp = icmp eq i64 %prol.iter.sub, 0 - br i1 %prol.iter.cmp, label %vector.body.prol.loopexit.unr-lcssa, label %vector.body.prol, !llvm.loop !70 + br i1 %prol.iter.cmp, label %vector.body.prol.loopexit.unr-lcssa, label %vector.body.prol, !llvm.loop !73 vector.body.prol.loopexit.unr-lcssa: ; preds = %vector.body.prol br label %vector.body.prol.loopexit @@ -305,62 +359,62 @@ vector.body: ; preds = %vector.body, %vecto %index = phi i64 [ %index.unr, %vector.body.preheader.new ], [ %index.next.7, %vector.body ] %18 = getelementptr inbounds float, float* %2, i64 %index %19 = bitcast float* %18 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %19, align 4, !tbaa !68 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %19, align 4, !tbaa !71 %20 = getelementptr float, float* %18, i64 4 %21 = bitcast float* %20 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %21, align 4, !tbaa !68 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %21, align 4, !tbaa !71 %index.next = add i64 %index, 8 %22 = getelementptr inbounds float, float* %2, i64 %index.next %23 = bitcast float* %22 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %23, align 4, !tbaa !68 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %23, align 4, !tbaa !71 %24 = getelementptr float, float* %22, i64 4 %25 = bitcast float* %24 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %25, align 4, !tbaa !68 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %25, align 4, !tbaa !71 %index.next.1 = add i64 %index, 16 %26 = getelementptr inbounds float, float* %2, i64 %index.next.1 %27 = bitcast float* %26 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %27, align 4, !tbaa !68 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %27, align 4, !tbaa !71 %28 = getelementptr float, float* %26, i64 4 %29 = bitcast float* %28 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %29, align 4, !tbaa !68 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %29, align 4, !tbaa !71 %index.next.2 = add i64 %index, 24 %30 = getelementptr inbounds float, float* %2, i64 %index.next.2 %31 = bitcast float* %30 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %31, align 4, !tbaa !68 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %31, align 4, !tbaa !71 %32 = getelementptr float, float* %30, i64 4 %33 = bitcast float* %32 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %33, align 4, !tbaa !68 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %33, align 4, !tbaa !71 %index.next.3 = add i64 %index, 32 %34 = getelementptr inbounds float, float* %2, i64 %index.next.3 %35 = bitcast float* %34 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %35, align 4, !tbaa !68 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %35, align 4, !tbaa !71 %36 = getelementptr float, float* %34, i64 4 %37 = bitcast float* %36 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %37, align 4, !tbaa !68 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %37, align 4, !tbaa !71 %index.next.4 = add i64 %index, 40 %38 = getelementptr inbounds float, float* %2, i64 %index.next.4 %39 = bitcast float* %38 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %39, align 4, !tbaa !68 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %39, align 4, !tbaa !71 %40 = getelementptr float, float* %38, i64 4 %41 = bitcast float* %40 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %41, align 4, !tbaa !68 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %41, align 4, !tbaa !71 %index.next.5 = add i64 %index, 48 %42 = getelementptr inbounds float, float* %2, i64 %index.next.5 %43 = bitcast float* %42 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %43, align 4, !tbaa !68 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %43, align 4, !tbaa !71 %44 = getelementptr float, float* %42, i64 4 %45 = bitcast float* %44 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %45, align 4, !tbaa !68 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %45, align 4, !tbaa !71 %index.next.6 = add i64 %index, 56 %46 = getelementptr inbounds float, float* %2, i64 %index.next.6 %47 = bitcast float* %46 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %47, align 4, !tbaa !68 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %47, align 4, !tbaa !71 %48 = getelementptr float, float* %46, i64 4 %49 = bitcast float* %48 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %49, align 4, !tbaa !68 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %49, align 4, !tbaa !71 %index.next.7 = add i64 %index, 64 %50 = icmp eq i64 %index.next.7, %n.vec - br i1 %50, label %middle.block.unr-lcssa, label %vector.body, !llvm.loop !72 + br i1 %50, label %middle.block.unr-lcssa, label %vector.body, !llvm.loop !75 middle.block.unr-lcssa: ; preds = %vector.body br label %middle.block @@ -378,11 +432,11 @@ for.body: ; preds = %for.body, %for.body %conv12 = phi i64 [ %conv, %for.body ], [ %conv12.ph, %for.body.preheader22 ] %i.011 = phi i32 [ %inc, %for.body ], [ %i.011.ph, %for.body.preheader22 ] %arrayidx = getelementptr inbounds float, float* %2, i64 %conv12 - store float 1.000000e+00, float* %arrayidx, align 4, !tbaa !68 + store float 1.000000e+00, float* %arrayidx, align 4, !tbaa !71 %inc = add i32 %i.011, 1 %conv = zext i32 %inc to i64 %cmp1 = icmp ult i64 %conv, %4 - br i1 %cmp1, label %for.body, label %if.end.loopexit, !llvm.loop !75 + br i1 %cmp1, label %for.body, label %if.end.loopexit, !llvm.loop !78 if.end.loopexit: ; preds = %for.body br label %if.end @@ -392,21 +446,21 @@ if.end: ; preds = %if.end.loopexit, %m } ; Function Attrs: nounwind uwtable -define void @_Z19fillWithOnesAndTwosPv(i8* %tensor_ptr) local_unnamed_addr #0 { +define void @_Z19fillWithOnesAndTwosPv(i8* %tensor_ptr) local_unnamed_addr #3 { entry: - tail call void @hpvm_request_tensor(i8* %tensor_ptr, i32 0) #7 + tail call void @hpvm_request_tensor(i8* %tensor_ptr, i32 0) #2 %data_type = bitcast i8* %tensor_ptr to i32* - %0 = load i32, i32* %data_type, align 8, !tbaa !67 + %0 = load i32, i32* %data_type, align 8, !tbaa !70 %cmp = icmp eq i32 %0, 0 br i1 %cmp, label %if.then, label %if.end if.then: ; preds = %entry - %host_data = getelementptr inbounds i8, i8* %tensor_ptr, i64 32 + %host_data = getelementptr inbounds i8, i8* %tensor_ptr, i64 48 %1 = bitcast i8* %host_data to float** - %2 = load float*, float** %1, align 8, !tbaa !66 - %num_elems = getelementptr inbounds i8, i8* %tensor_ptr, i64 48 + %2 = load float*, float** %1, align 8, !tbaa !68 + %num_elems = getelementptr inbounds i8, i8* %tensor_ptr, i64 72 %3 = bitcast i8* %num_elems to i64* - %4 = load i64, i64* %3, align 8, !tbaa !64 + %4 = load i64, i64* %3, align 8, !tbaa !67 %div35 = lshr i64 %4, 1 %cmp136 = icmp eq i64 %div35, 0 br i1 %cmp136, label %for.cond.cleanup, label %for.body.preheader @@ -450,14 +504,14 @@ vector.body.prol: ; preds = %vector.body.prol, % %prol.iter88 = phi i64 [ %prol.iter88.sub, %vector.body.prol ], [ %xtraiter86, %vector.body.prol.preheader ] %13 = getelementptr inbounds float, float* %2, i64 %index.prol %14 = bitcast float* %13 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %14, align 4, !tbaa !68 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %14, align 4, !tbaa !71 %15 = getelementptr float, float* %13, i64 4 %16 = bitcast float* %15 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %16, align 4, !tbaa !68 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %16, align 4, !tbaa !71 %index.next.prol = add i64 %index.prol, 8 %prol.iter88.sub = add i64 %prol.iter88, -1 %prol.iter88.cmp = icmp eq i64 %prol.iter88.sub, 0 - br i1 %prol.iter88.cmp, label %vector.body.prol.loopexit.unr-lcssa, label %vector.body.prol, !llvm.loop !76 + br i1 %prol.iter88.cmp, label %vector.body.prol.loopexit.unr-lcssa, label %vector.body.prol, !llvm.loop !79 vector.body.prol.loopexit.unr-lcssa: ; preds = %vector.body.prol br label %vector.body.prol.loopexit @@ -474,62 +528,62 @@ vector.body: ; preds = %vector.body, %vecto %index = phi i64 [ %index.unr, %vector.body.preheader.new ], [ %index.next.7, %vector.body ] %18 = getelementptr inbounds float, float* %2, i64 %index %19 = bitcast float* %18 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %19, align 4, !tbaa !68 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %19, align 4, !tbaa !71 %20 = getelementptr float, float* %18, i64 4 %21 = bitcast float* %20 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %21, align 4, !tbaa !68 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %21, align 4, !tbaa !71 %index.next = add i64 %index, 8 %22 = getelementptr inbounds float, float* %2, i64 %index.next %23 = bitcast float* %22 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %23, align 4, !tbaa !68 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %23, align 4, !tbaa !71 %24 = getelementptr float, float* %22, i64 4 %25 = bitcast float* %24 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %25, align 4, !tbaa !68 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %25, align 4, !tbaa !71 %index.next.1 = add i64 %index, 16 %26 = getelementptr inbounds float, float* %2, i64 %index.next.1 %27 = bitcast float* %26 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %27, align 4, !tbaa !68 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %27, align 4, !tbaa !71 %28 = getelementptr float, float* %26, i64 4 %29 = bitcast float* %28 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %29, align 4, !tbaa !68 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %29, align 4, !tbaa !71 %index.next.2 = add i64 %index, 24 %30 = getelementptr inbounds float, float* %2, i64 %index.next.2 %31 = bitcast float* %30 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %31, align 4, !tbaa !68 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %31, align 4, !tbaa !71 %32 = getelementptr float, float* %30, i64 4 %33 = bitcast float* %32 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %33, align 4, !tbaa !68 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %33, align 4, !tbaa !71 %index.next.3 = add i64 %index, 32 %34 = getelementptr inbounds float, float* %2, i64 %index.next.3 %35 = bitcast float* %34 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %35, align 4, !tbaa !68 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %35, align 4, !tbaa !71 %36 = getelementptr float, float* %34, i64 4 %37 = bitcast float* %36 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %37, align 4, !tbaa !68 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %37, align 4, !tbaa !71 %index.next.4 = add i64 %index, 40 %38 = getelementptr inbounds float, float* %2, i64 %index.next.4 %39 = bitcast float* %38 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %39, align 4, !tbaa !68 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %39, align 4, !tbaa !71 %40 = getelementptr float, float* %38, i64 4 %41 = bitcast float* %40 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %41, align 4, !tbaa !68 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %41, align 4, !tbaa !71 %index.next.5 = add i64 %index, 48 %42 = getelementptr inbounds float, float* %2, i64 %index.next.5 %43 = bitcast float* %42 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %43, align 4, !tbaa !68 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %43, align 4, !tbaa !71 %44 = getelementptr float, float* %42, i64 4 %45 = bitcast float* %44 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %45, align 4, !tbaa !68 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %45, align 4, !tbaa !71 %index.next.6 = add i64 %index, 56 %46 = getelementptr inbounds float, float* %2, i64 %index.next.6 %47 = bitcast float* %46 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %47, align 4, !tbaa !68 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %47, align 4, !tbaa !71 %48 = getelementptr float, float* %46, i64 4 %49 = bitcast float* %48 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %49, align 4, !tbaa !68 + store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float>* %49, align 4, !tbaa !71 %index.next.7 = add i64 %index, 64 %50 = icmp eq i64 %index.next.7, %n.vec - br i1 %50, label %middle.block.unr-lcssa, label %vector.body, !llvm.loop !77 + br i1 %50, label %middle.block.unr-lcssa, label %vector.body, !llvm.loop !80 middle.block.unr-lcssa: ; preds = %vector.body br label %middle.block @@ -607,14 +661,14 @@ vector.body49.prol: ; preds = %vector.body49.prol, %74 = add i64 %conv731, %index67.prol %75 = getelementptr inbounds float, float* %2, i64 %74 %76 = bitcast float* %75 to <4 x float>* - store <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <4 x float>* %76, align 4, !tbaa !68 + store <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <4 x float>* %76, align 4, !tbaa !71 %77 = getelementptr float, float* %75, i64 4 %78 = bitcast float* %77 to <4 x float>* - store <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <4 x float>* %78, align 4, !tbaa !68 + store <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <4 x float>* %78, align 4, !tbaa !71 %index.next68.prol = add i64 %index67.prol, 8 %prol.iter.sub = add i64 %prol.iter, -1 %prol.iter.cmp = icmp eq i64 %prol.iter.sub, 0 - br i1 %prol.iter.cmp, label %vector.body49.prol.loopexit.unr-lcssa, label %vector.body49.prol, !llvm.loop !78 + br i1 %prol.iter.cmp, label %vector.body49.prol.loopexit.unr-lcssa, label %vector.body49.prol, !llvm.loop !81 vector.body49.prol.loopexit.unr-lcssa: ; preds = %vector.body49.prol br label %vector.body49.prol.loopexit @@ -632,37 +686,37 @@ vector.body49: ; preds = %vector.body49, %vec %80 = add i64 %conv731, %index67 %81 = getelementptr inbounds float, float* %2, i64 %80 %82 = bitcast float* %81 to <4 x float>* - store <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <4 x float>* %82, align 4, !tbaa !68 + store <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <4 x float>* %82, align 4, !tbaa !71 %83 = getelementptr float, float* %81, i64 4 %84 = bitcast float* %83 to <4 x float>* - store <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <4 x float>* %84, align 4, !tbaa !68 + store <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <4 x float>* %84, align 4, !tbaa !71 %index.next68 = add i64 %index67, 8 %85 = add i64 %conv731, %index.next68 %86 = getelementptr inbounds float, float* %2, i64 %85 %87 = bitcast float* %86 to <4 x float>* - store <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <4 x float>* %87, align 4, !tbaa !68 + store <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <4 x float>* %87, align 4, !tbaa !71 %88 = getelementptr float, float* %86, i64 4 %89 = bitcast float* %88 to <4 x float>* - store <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <4 x float>* %89, align 4, !tbaa !68 + store <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <4 x float>* %89, align 4, !tbaa !71 %index.next68.1 = add i64 %index67, 16 %90 = add i64 %conv731, %index.next68.1 %91 = getelementptr inbounds float, float* %2, i64 %90 %92 = bitcast float* %91 to <4 x float>* - store <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <4 x float>* %92, align 4, !tbaa !68 + store <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <4 x float>* %92, align 4, !tbaa !71 %93 = getelementptr float, float* %91, i64 4 %94 = bitcast float* %93 to <4 x float>* - store <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <4 x float>* %94, align 4, !tbaa !68 + store <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <4 x float>* %94, align 4, !tbaa !71 %index.next68.2 = add i64 %index67, 24 %95 = add i64 %conv731, %index.next68.2 %96 = getelementptr inbounds float, float* %2, i64 %95 %97 = bitcast float* %96 to <4 x float>* - store <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <4 x float>* %97, align 4, !tbaa !68 + store <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <4 x float>* %97, align 4, !tbaa !71 %98 = getelementptr float, float* %96, i64 4 %99 = bitcast float* %98 to <4 x float>* - store <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <4 x float>* %99, align 4, !tbaa !68 + store <4 x float> <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>, <4 x float>* %99, align 4, !tbaa !71 %index.next68.3 = add i64 %index67, 32 %100 = icmp eq i64 %index.next68.3, %n.vec55 - br i1 %100, label %middle.block50.unr-lcssa, label %vector.body49, !llvm.loop !79 + br i1 %100, label %middle.block50.unr-lcssa, label %vector.body49, !llvm.loop !82 middle.block50.unr-lcssa: ; preds = %vector.body49 br label %middle.block50 @@ -675,21 +729,21 @@ for.body: ; preds = %for.body, %for.body %conv38 = phi i64 [ %conv, %for.body ], [ %conv38.ph, %for.body.preheader85 ] %i.037 = phi i32 [ %inc, %for.body ], [ %i.037.ph, %for.body.preheader85 ] %arrayidx = getelementptr inbounds float, float* %2, i64 %conv38 - store float 1.000000e+00, float* %arrayidx, align 4, !tbaa !68 + store float 1.000000e+00, float* %arrayidx, align 4, !tbaa !71 %inc = add i32 %i.037, 1 %conv = zext i32 %inc to i64 %cmp1 = icmp ult i64 %conv, %div35 - br i1 %cmp1, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !80 + br i1 %cmp1, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !83 for.body11: ; preds = %for.body11, %for.body11.preheader %conv734 = phi i64 [ %conv7, %for.body11 ], [ %conv734.ph, %for.body11.preheader ] %i2.033 = phi i32 [ %inc15, %for.body11 ], [ %i2.033.ph, %for.body11.preheader ] %arrayidx13 = getelementptr inbounds float, float* %2, i64 %conv734 - store float 2.000000e+00, float* %arrayidx13, align 4, !tbaa !68 + store float 2.000000e+00, float* %arrayidx13, align 4, !tbaa !71 %inc15 = add i32 %i2.033, 1 %conv7 = zext i32 %inc15 to i64 %cmp9 = icmp ult i64 %conv7, %4 - br i1 %cmp9, label %for.body11, label %if.end.loopexit, !llvm.loop !81 + br i1 %cmp9, label %for.body11, label %if.end.loopexit, !llvm.loop !84 if.end.loopexit: ; preds = %for.body11 br label %if.end @@ -699,21 +753,186 @@ if.end: ; preds = %if.end.loopexit, %m } ; Function Attrs: nounwind uwtable -define void @_Z21fillTensorWithNegOnesPv(i8* %tensor_ptr) local_unnamed_addr #0 { +define void @_Z17fillTensorWithValPvf(i8* %tensor_ptr, float %target_value) local_unnamed_addr #3 { +entry: + tail call void @hpvm_request_tensor(i8* %tensor_ptr, i32 0) #2 + %data_type = bitcast i8* %tensor_ptr to i32* + %0 = load i32, i32* %data_type, align 8, !tbaa !70 + %cmp = icmp eq i32 %0, 0 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %host_data = getelementptr inbounds i8, i8* %tensor_ptr, i64 48 + %1 = bitcast i8* %host_data to float** + %2 = load float*, float** %1, align 8, !tbaa !68 + %num_elems = getelementptr inbounds i8, i8* %tensor_ptr, i64 72 + %3 = bitcast i8* %num_elems to i64* + %4 = load i64, i64* %3, align 8, !tbaa !67 + %cmp110 = icmp eq i64 %4, 0 + br i1 %cmp110, label %if.end, label %for.body.preheader + +for.body.preheader: ; preds = %if.then + %min.iters.check = icmp ult i64 %4, 8 + br i1 %min.iters.check, label %for.body.preheader24, label %min.iters.checked + +min.iters.checked: ; preds = %for.body.preheader + %n.vec = and i64 %4, -8 + %cmp.zero = icmp eq i64 %n.vec, 0 + br i1 %cmp.zero, label %for.body.preheader24, label %vector.scevcheck + +vector.scevcheck: ; preds = %min.iters.checked + %5 = add i64 %4, -1 + %6 = trunc i64 %5 to i32 + %7 = icmp eq i32 %6, -1 + %8 = icmp ugt i64 %5, 4294967295 + %9 = or i1 %7, %8 + %cast.crd = trunc i64 %n.vec to i32 + br i1 %9, label %for.body.preheader24, label %vector.ph + +vector.ph: ; preds = %vector.scevcheck + %broadcast.splatinsert22 = insertelement <4 x float> undef, float %target_value, i32 0 + %broadcast.splat23 = shufflevector <4 x float> %broadcast.splatinsert22, <4 x float> undef, <4 x i32> zeroinitializer + %10 = add i64 %n.vec, -8 + %11 = lshr exact i64 %10, 3 + %12 = add nuw nsw i64 %11, 1 + %xtraiter = and i64 %12, 7 + %lcmp.mod = icmp eq i64 %xtraiter, 0 + br i1 %lcmp.mod, label %vector.body.prol.loopexit, label %vector.body.prol.preheader + +vector.body.prol.preheader: ; preds = %vector.ph + br label %vector.body.prol + +vector.body.prol: ; preds = %vector.body.prol, %vector.body.prol.preheader + %index.prol = phi i64 [ 0, %vector.body.prol.preheader ], [ %index.next.prol, %vector.body.prol ] + %prol.iter = phi i64 [ %xtraiter, %vector.body.prol.preheader ], [ %prol.iter.sub, %vector.body.prol ] + %13 = getelementptr inbounds float, float* %2, i64 %index.prol + %14 = bitcast float* %13 to <4 x float>* + store <4 x float> %broadcast.splat23, <4 x float>* %14, align 4, !tbaa !71 + %15 = getelementptr float, float* %13, i64 4 + %16 = bitcast float* %15 to <4 x float>* + store <4 x float> %broadcast.splat23, <4 x float>* %16, align 4, !tbaa !71 + %index.next.prol = add i64 %index.prol, 8 + %prol.iter.sub = add i64 %prol.iter, -1 + %prol.iter.cmp = icmp eq i64 %prol.iter.sub, 0 + br i1 %prol.iter.cmp, label %vector.body.prol.loopexit.unr-lcssa, label %vector.body.prol, !llvm.loop !85 + +vector.body.prol.loopexit.unr-lcssa: ; preds = %vector.body.prol + br label %vector.body.prol.loopexit + +vector.body.prol.loopexit: ; preds = %vector.body.prol.loopexit.unr-lcssa, %vector.ph + %index.unr = phi i64 [ 0, %vector.ph ], [ %index.next.prol, %vector.body.prol.loopexit.unr-lcssa ] + %17 = icmp ult i64 %10, 56 + br i1 %17, label %middle.block, label %vector.ph.new + +vector.ph.new: ; preds = %vector.body.prol.loopexit + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph.new + %index = phi i64 [ %index.unr, %vector.ph.new ], [ %index.next.7, %vector.body ] + %18 = getelementptr inbounds float, float* %2, i64 %index + %19 = bitcast float* %18 to <4 x float>* + store <4 x float> %broadcast.splat23, <4 x float>* %19, align 4, !tbaa !71 + %20 = getelementptr float, float* %18, i64 4 + %21 = bitcast float* %20 to <4 x float>* + store <4 x float> %broadcast.splat23, <4 x float>* %21, align 4, !tbaa !71 + %index.next = add i64 %index, 8 + %22 = getelementptr inbounds float, float* %2, i64 %index.next + %23 = bitcast float* %22 to <4 x float>* + store <4 x float> %broadcast.splat23, <4 x float>* %23, align 4, !tbaa !71 + %24 = getelementptr float, float* %22, i64 4 + %25 = bitcast float* %24 to <4 x float>* + store <4 x float> %broadcast.splat23, <4 x float>* %25, align 4, !tbaa !71 + %index.next.1 = add i64 %index, 16 + %26 = getelementptr inbounds float, float* %2, i64 %index.next.1 + %27 = bitcast float* %26 to <4 x float>* + store <4 x float> %broadcast.splat23, <4 x float>* %27, align 4, !tbaa !71 + %28 = getelementptr float, float* %26, i64 4 + %29 = bitcast float* %28 to <4 x float>* + store <4 x float> %broadcast.splat23, <4 x float>* %29, align 4, !tbaa !71 + %index.next.2 = add i64 %index, 24 + %30 = getelementptr inbounds float, float* %2, i64 %index.next.2 + %31 = bitcast float* %30 to <4 x float>* + store <4 x float> %broadcast.splat23, <4 x float>* %31, align 4, !tbaa !71 + %32 = getelementptr float, float* %30, i64 4 + %33 = bitcast float* %32 to <4 x float>* + store <4 x float> %broadcast.splat23, <4 x float>* %33, align 4, !tbaa !71 + %index.next.3 = add i64 %index, 32 + %34 = getelementptr inbounds float, float* %2, i64 %index.next.3 + %35 = bitcast float* %34 to <4 x float>* + store <4 x float> %broadcast.splat23, <4 x float>* %35, align 4, !tbaa !71 + %36 = getelementptr float, float* %34, i64 4 + %37 = bitcast float* %36 to <4 x float>* + store <4 x float> %broadcast.splat23, <4 x float>* %37, align 4, !tbaa !71 + %index.next.4 = add i64 %index, 40 + %38 = getelementptr inbounds float, float* %2, i64 %index.next.4 + %39 = bitcast float* %38 to <4 x float>* + store <4 x float> %broadcast.splat23, <4 x float>* %39, align 4, !tbaa !71 + %40 = getelementptr float, float* %38, i64 4 + %41 = bitcast float* %40 to <4 x float>* + store <4 x float> %broadcast.splat23, <4 x float>* %41, align 4, !tbaa !71 + %index.next.5 = add i64 %index, 48 + %42 = getelementptr inbounds float, float* %2, i64 %index.next.5 + %43 = bitcast float* %42 to <4 x float>* + store <4 x float> %broadcast.splat23, <4 x float>* %43, align 4, !tbaa !71 + %44 = getelementptr float, float* %42, i64 4 + %45 = bitcast float* %44 to <4 x float>* + store <4 x float> %broadcast.splat23, <4 x float>* %45, align 4, !tbaa !71 + %index.next.6 = add i64 %index, 56 + %46 = getelementptr inbounds float, float* %2, i64 %index.next.6 + %47 = bitcast float* %46 to <4 x float>* + store <4 x float> %broadcast.splat23, <4 x float>* %47, align 4, !tbaa !71 + %48 = getelementptr float, float* %46, i64 4 + %49 = bitcast float* %48 to <4 x float>* + store <4 x float> %broadcast.splat23, <4 x float>* %49, align 4, !tbaa !71 + %index.next.7 = add i64 %index, 64 + %50 = icmp eq i64 %index.next.7, %n.vec + br i1 %50, label %middle.block.unr-lcssa, label %vector.body, !llvm.loop !86 + +middle.block.unr-lcssa: ; preds = %vector.body + br label %middle.block + +middle.block: ; preds = %middle.block.unr-lcssa, %vector.body.prol.loopexit + %cmp.n = icmp eq i64 %4, %n.vec + br i1 %cmp.n, label %if.end, label %for.body.preheader24 + +for.body.preheader24: ; preds = %middle.block, %vector.scevcheck, %min.iters.checked, %for.body.preheader + %conv12.ph = phi i64 [ 0, %vector.scevcheck ], [ 0, %min.iters.checked ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] + %i.011.ph = phi i32 [ 0, %vector.scevcheck ], [ 0, %min.iters.checked ], [ 0, %for.body.preheader ], [ %cast.crd, %middle.block ] + br label %for.body + +for.body: ; preds = %for.body, %for.body.preheader24 + %conv12 = phi i64 [ %conv, %for.body ], [ %conv12.ph, %for.body.preheader24 ] + %i.011 = phi i32 [ %inc, %for.body ], [ %i.011.ph, %for.body.preheader24 ] + %arrayidx = getelementptr inbounds float, float* %2, i64 %conv12 + store float %target_value, float* %arrayidx, align 4, !tbaa !71 + %inc = add i32 %i.011, 1 + %conv = zext i32 %inc to i64 + %cmp1 = icmp ult i64 %conv, %4 + br i1 %cmp1, label %for.body, label %if.end.loopexit, !llvm.loop !87 + +if.end.loopexit: ; preds = %for.body + br label %if.end + +if.end: ; preds = %if.end.loopexit, %middle.block, %if.then, %entry + ret void +} + +; Function Attrs: nounwind uwtable +define void @_Z21fillTensorWithNegOnesPv(i8* %tensor_ptr) local_unnamed_addr #3 { entry: - tail call void @hpvm_request_tensor(i8* %tensor_ptr, i32 0) #7 + tail call void @hpvm_request_tensor(i8* %tensor_ptr, i32 0) #2 %data_type = bitcast i8* %tensor_ptr to i32* - %0 = load i32, i32* %data_type, align 8, !tbaa !67 + %0 = load i32, i32* %data_type, align 8, !tbaa !70 %cmp = icmp eq i32 %0, 0 br i1 %cmp, label %if.then, label %if.end if.then: ; preds = %entry - %host_data = getelementptr inbounds i8, i8* %tensor_ptr, i64 32 + %host_data = getelementptr inbounds i8, i8* %tensor_ptr, i64 48 %1 = bitcast i8* %host_data to float** - %2 = load float*, float** %1, align 8, !tbaa !66 - %num_elems = getelementptr inbounds i8, i8* %tensor_ptr, i64 48 + %2 = load float*, float** %1, align 8, !tbaa !68 + %num_elems = getelementptr inbounds i8, i8* %tensor_ptr, i64 72 %3 = bitcast i8* %num_elems to i64* - %4 = load i64, i64* %3, align 8, !tbaa !64 + %4 = load i64, i64* %3, align 8, !tbaa !67 %cmp110 = icmp eq i64 %4, 0 br i1 %cmp110, label %if.end, label %for.body.preheader @@ -751,14 +970,14 @@ vector.body.prol: ; preds = %vector.body.prol, % %prol.iter = phi i64 [ %prol.iter.sub, %vector.body.prol ], [ %xtraiter, %vector.body.prol.preheader ] %13 = getelementptr inbounds float, float* %2, i64 %index.prol %14 = bitcast float* %13 to <4 x float>* - store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %14, align 4, !tbaa !68 + store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %14, align 4, !tbaa !71 %15 = getelementptr float, float* %13, i64 4 %16 = bitcast float* %15 to <4 x float>* - store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %16, align 4, !tbaa !68 + store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %16, align 4, !tbaa !71 %index.next.prol = add i64 %index.prol, 8 %prol.iter.sub = add i64 %prol.iter, -1 %prol.iter.cmp = icmp eq i64 %prol.iter.sub, 0 - br i1 %prol.iter.cmp, label %vector.body.prol.loopexit.unr-lcssa, label %vector.body.prol, !llvm.loop !82 + br i1 %prol.iter.cmp, label %vector.body.prol.loopexit.unr-lcssa, label %vector.body.prol, !llvm.loop !88 vector.body.prol.loopexit.unr-lcssa: ; preds = %vector.body.prol br label %vector.body.prol.loopexit @@ -775,62 +994,62 @@ vector.body: ; preds = %vector.body, %vecto %index = phi i64 [ %index.unr, %vector.body.preheader.new ], [ %index.next.7, %vector.body ] %18 = getelementptr inbounds float, float* %2, i64 %index %19 = bitcast float* %18 to <4 x float>* - store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %19, align 4, !tbaa !68 + store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %19, align 4, !tbaa !71 %20 = getelementptr float, float* %18, i64 4 %21 = bitcast float* %20 to <4 x float>* - store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %21, align 4, !tbaa !68 + store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %21, align 4, !tbaa !71 %index.next = add i64 %index, 8 %22 = getelementptr inbounds float, float* %2, i64 %index.next %23 = bitcast float* %22 to <4 x float>* - store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %23, align 4, !tbaa !68 + store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %23, align 4, !tbaa !71 %24 = getelementptr float, float* %22, i64 4 %25 = bitcast float* %24 to <4 x float>* - store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %25, align 4, !tbaa !68 + store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %25, align 4, !tbaa !71 %index.next.1 = add i64 %index, 16 %26 = getelementptr inbounds float, float* %2, i64 %index.next.1 %27 = bitcast float* %26 to <4 x float>* - store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %27, align 4, !tbaa !68 + store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %27, align 4, !tbaa !71 %28 = getelementptr float, float* %26, i64 4 %29 = bitcast float* %28 to <4 x float>* - store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %29, align 4, !tbaa !68 + store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %29, align 4, !tbaa !71 %index.next.2 = add i64 %index, 24 %30 = getelementptr inbounds float, float* %2, i64 %index.next.2 %31 = bitcast float* %30 to <4 x float>* - store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %31, align 4, !tbaa !68 + store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %31, align 4, !tbaa !71 %32 = getelementptr float, float* %30, i64 4 %33 = bitcast float* %32 to <4 x float>* - store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %33, align 4, !tbaa !68 + store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %33, align 4, !tbaa !71 %index.next.3 = add i64 %index, 32 %34 = getelementptr inbounds float, float* %2, i64 %index.next.3 %35 = bitcast float* %34 to <4 x float>* - store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %35, align 4, !tbaa !68 + store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %35, align 4, !tbaa !71 %36 = getelementptr float, float* %34, i64 4 %37 = bitcast float* %36 to <4 x float>* - store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %37, align 4, !tbaa !68 + store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %37, align 4, !tbaa !71 %index.next.4 = add i64 %index, 40 %38 = getelementptr inbounds float, float* %2, i64 %index.next.4 %39 = bitcast float* %38 to <4 x float>* - store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %39, align 4, !tbaa !68 + store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %39, align 4, !tbaa !71 %40 = getelementptr float, float* %38, i64 4 %41 = bitcast float* %40 to <4 x float>* - store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %41, align 4, !tbaa !68 + store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %41, align 4, !tbaa !71 %index.next.5 = add i64 %index, 48 %42 = getelementptr inbounds float, float* %2, i64 %index.next.5 %43 = bitcast float* %42 to <4 x float>* - store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %43, align 4, !tbaa !68 + store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %43, align 4, !tbaa !71 %44 = getelementptr float, float* %42, i64 4 %45 = bitcast float* %44 to <4 x float>* - store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %45, align 4, !tbaa !68 + store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %45, align 4, !tbaa !71 %index.next.6 = add i64 %index, 56 %46 = getelementptr inbounds float, float* %2, i64 %index.next.6 %47 = bitcast float* %46 to <4 x float>* - store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %47, align 4, !tbaa !68 + store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %47, align 4, !tbaa !71 %48 = getelementptr float, float* %46, i64 4 %49 = bitcast float* %48 to <4 x float>* - store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %49, align 4, !tbaa !68 + store <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, <4 x float>* %49, align 4, !tbaa !71 %index.next.7 = add i64 %index, 64 %50 = icmp eq i64 %index.next.7, %n.vec - br i1 %50, label %middle.block.unr-lcssa, label %vector.body, !llvm.loop !83 + br i1 %50, label %middle.block.unr-lcssa, label %vector.body, !llvm.loop !89 middle.block.unr-lcssa: ; preds = %vector.body br label %middle.block @@ -848,11 +1067,11 @@ for.body: ; preds = %for.body, %for.body %conv12 = phi i64 [ %conv, %for.body ], [ %conv12.ph, %for.body.preheader22 ] %i.011 = phi i32 [ %inc, %for.body ], [ %i.011.ph, %for.body.preheader22 ] %arrayidx = getelementptr inbounds float, float* %2, i64 %conv12 - store float -1.000000e+00, float* %arrayidx, align 4, !tbaa !68 + store float -1.000000e+00, float* %arrayidx, align 4, !tbaa !71 %inc = add i32 %i.011, 1 %conv = zext i32 %inc to i64 %cmp1 = icmp ult i64 %conv, %4 - br i1 %cmp1, label %for.body, label %if.end.loopexit, !llvm.loop !84 + br i1 %cmp1, label %for.body, label %if.end.loopexit, !llvm.loop !90 if.end.loopexit: ; preds = %for.body br label %if.end @@ -862,20 +1081,20 @@ if.end: ; preds = %if.end.loopexit, %m } ; Function Attrs: norecurse nounwind uwtable -define void @_Z14fillTensorValsPv(i8* nocapture readonly %tensor_ptr) local_unnamed_addr #5 { +define void @_Z14fillTensorValsPv(i8* nocapture readonly %tensor_ptr) local_unnamed_addr #6 { entry: %data_type = bitcast i8* %tensor_ptr to i32* - %0 = load i32, i32* %data_type, align 8, !tbaa !67 + %0 = load i32, i32* %data_type, align 8, !tbaa !70 %cmp = icmp eq i32 %0, 0 br i1 %cmp, label %if.then, label %if.end if.then: ; preds = %entry - %host_data = getelementptr inbounds i8, i8* %tensor_ptr, i64 32 + %host_data = getelementptr inbounds i8, i8* %tensor_ptr, i64 48 %1 = bitcast i8* %host_data to float** - %2 = load float*, float** %1, align 8, !tbaa !66 - %num_elems = getelementptr inbounds i8, i8* %tensor_ptr, i64 48 + %2 = load float*, float** %1, align 8, !tbaa !68 + %num_elems = getelementptr inbounds i8, i8* %tensor_ptr, i64 72 %3 = bitcast i8* %num_elems to i64* - %4 = load i64, i64* %3, align 8, !tbaa !64 + %4 = load i64, i64* %3, align 8, !tbaa !67 %cmp111 = icmp eq i64 %4, 0 br i1 %cmp111, label %if.end, label %for.body.preheader @@ -909,10 +1128,10 @@ vector.body.prol.preheader: ; preds = %vector.body.prehead vector.body.prol: ; preds = %vector.body.prol.preheader %13 = bitcast float* %2 to <4 x float>* - store <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>, <4 x float>* %13, align 4, !tbaa !68 + store <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>, <4 x float>* %13, align 4, !tbaa !71 %14 = getelementptr float, float* %2, i64 4 %15 = bitcast float* %14 to <4 x float>* - store <4 x float> <float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00>, <4 x float>* %15, align 4, !tbaa !68 + store <4 x float> <float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00>, <4 x float>* %15, align 4, !tbaa !71 br label %vector.body.prol.loopexit vector.body.prol.loopexit: ; preds = %vector.body.prol, %vector.body.preheader @@ -934,10 +1153,10 @@ vector.body: ; preds = %vector.body, %vecto %21 = uitofp <4 x i32> %19 to <4 x float> %22 = getelementptr inbounds float, float* %2, i64 %index %23 = bitcast float* %22 to <4 x float>* - store <4 x float> %20, <4 x float>* %23, align 4, !tbaa !68 + store <4 x float> %20, <4 x float>* %23, align 4, !tbaa !71 %24 = getelementptr float, float* %22, i64 4 %25 = bitcast float* %24 to <4 x float>* - store <4 x float> %21, <4 x float>* %25, align 4, !tbaa !68 + store <4 x float> %21, <4 x float>* %25, align 4, !tbaa !71 %index.next = add i64 %index, 8 %26 = trunc i64 %index.next to i32 %broadcast.splatinsert19.1 = insertelement <4 x i32> undef, i32 %26, i32 0 @@ -948,13 +1167,13 @@ vector.body: ; preds = %vector.body, %vecto %30 = uitofp <4 x i32> %28 to <4 x float> %31 = getelementptr inbounds float, float* %2, i64 %index.next %32 = bitcast float* %31 to <4 x float>* - store <4 x float> %29, <4 x float>* %32, align 4, !tbaa !68 + store <4 x float> %29, <4 x float>* %32, align 4, !tbaa !71 %33 = getelementptr float, float* %31, i64 4 %34 = bitcast float* %33 to <4 x float>* - store <4 x float> %30, <4 x float>* %34, align 4, !tbaa !68 + store <4 x float> %30, <4 x float>* %34, align 4, !tbaa !71 %index.next.1 = add i64 %index, 16 %35 = icmp eq i64 %index.next.1, %n.vec - br i1 %35, label %middle.block.unr-lcssa, label %vector.body, !llvm.loop !85 + br i1 %35, label %middle.block.unr-lcssa, label %vector.body, !llvm.loop !91 middle.block.unr-lcssa: ; preds = %vector.body br label %middle.block @@ -974,10 +1193,10 @@ for.body: ; preds = %for.body, %for.body %add = add i32 %i.012, 1 %conv2 = uitofp i32 %add to float %arrayidx = getelementptr inbounds float, float* %2, i64 %conv13 - store float %conv2, float* %arrayidx, align 4, !tbaa !68 + store float %conv2, float* %arrayidx, align 4, !tbaa !71 %conv = zext i32 %add to i64 %cmp1 = icmp ult i64 %conv, %4 - br i1 %cmp1, label %for.body, label %if.end.loopexit, !llvm.loop !86 + br i1 %cmp1, label %for.body, label %if.end.loopexit, !llvm.loop !92 if.end.loopexit: ; preds = %for.body br label %if.end @@ -987,21 +1206,21 @@ if.end: ; preds = %if.end.loopexit, %m } ; Function Attrs: nounwind uwtable -define void @_Z17printTensorValuesPv(i8* %tensor_ptr) local_unnamed_addr #0 { +define void @_Z17printTensorValuesPv(i8* %tensor_ptr) local_unnamed_addr #3 { entry: - tail call void @hpvm_request_tensor(i8* %tensor_ptr, i32 0) #7 + tail call void @hpvm_request_tensor(i8* %tensor_ptr, i32 0) #2 %data_type = bitcast i8* %tensor_ptr to i32* - %0 = load i32, i32* %data_type, align 8, !tbaa !67 + %0 = load i32, i32* %data_type, align 8, !tbaa !70 %cmp = icmp eq i32 %0, 0 br i1 %cmp, label %if.then, label %if.end if.then: ; preds = %entry - %host_data = getelementptr inbounds i8, i8* %tensor_ptr, i64 32 + %host_data = getelementptr inbounds i8, i8* %tensor_ptr, i64 48 %1 = bitcast i8* %host_data to float** - %2 = load float*, float** %1, align 8, !tbaa !66 - %num_elems = getelementptr inbounds i8, i8* %tensor_ptr, i64 48 + %2 = load float*, float** %1, align 8, !tbaa !68 + %num_elems = getelementptr inbounds i8, i8* %tensor_ptr, i64 72 %3 = bitcast i8* %num_elems to i64* - %4 = load i64, i64* %3, align 8, !tbaa !64 + %4 = load i64, i64* %3, align 8, !tbaa !67 %cmp112 = icmp eq i64 %4, 0 br i1 %cmp112, label %if.end, label %for.body.preheader @@ -1012,12 +1231,12 @@ for.body: ; preds = %for.body, %for.body %conv14 = phi i64 [ %conv, %for.body ], [ 0, %for.body.preheader ] %i.013 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] %arrayidx = getelementptr inbounds float, float* %2, i64 %conv14 - %5 = load float, float* %arrayidx, align 4, !tbaa !68 + %5 = load float, float* %arrayidx, align 4, !tbaa !71 %conv2 = fpext float %5 to double - %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str.9, i64 0, i64 0), double %conv2) + %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str.8, i64 0, i64 0), double %conv2) %inc = add i32 %i.013, 1 %conv = zext i32 %inc to i64 - %6 = load i64, i64* %3, align 8, !tbaa !64 + %6 = load i64, i64* %3, align 8, !tbaa !67 %cmp1 = icmp ult i64 %conv, %6 br i1 %cmp1, label %for.body, label %if.end.loopexit @@ -1025,25 +1244,25 @@ if.end.loopexit: ; preds = %for.body br label %if.end if.end: ; preds = %if.end.loopexit, %if.then, %entry - %putchar = tail call i32 @putchar(i32 10) #7 + %putchar = tail call i32 @putchar(i32 10) #2 ret void } ; Function Attrs: nounwind uwtable -define void @_Z15printTensorDimsPv(i8* nocapture readonly %tensor_ptr) local_unnamed_addr #0 { +define void @_Z15printTensorDimsPv(i8* nocapture readonly %tensor_ptr) local_unnamed_addr #3 { entry: - %num_elems = getelementptr inbounds i8, i8* %tensor_ptr, i64 48 + %num_elems = getelementptr inbounds i8, i8* %tensor_ptr, i64 72 %0 = bitcast i8* %num_elems to i64* - %1 = load i64, i64* %0, align 8, !tbaa !64 - %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.11, i64 0, i64 0), i64 %1) - %dims = getelementptr inbounds i8, i8* %tensor_ptr, i64 64 + %1 = load i64, i64* %0, align 8, !tbaa !67 + %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.10, i64 0, i64 0), i64 %1) + %dims = getelementptr inbounds i8, i8* %tensor_ptr, i64 88 %num_dims = bitcast i8* %dims to i32* - %2 = load i32, i32* %num_dims, align 8, !tbaa !61 + %2 = load i32, i32* %num_dims, align 8, !tbaa !64 %cmp10 = icmp sgt i32 %2, 0 br i1 %cmp10, label %for.body.lr.ph, label %for.cond.cleanup for.body.lr.ph: ; preds = %entry - %dim_sizes = getelementptr inbounds i8, i8* %tensor_ptr, i64 72 + %dim_sizes = getelementptr inbounds i8, i8* %tensor_ptr, i64 96 %3 = bitcast i8* %dim_sizes to i64** br label %for.body @@ -1055,32 +1274,32 @@ for.cond.cleanup: ; preds = %for.cond.cleanup.lo for.body: ; preds = %for.body, %for.body.lr.ph %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ] - %4 = load i64*, i64** %3, align 8, !tbaa !62 + %4 = load i64*, i64** %3, align 8, !tbaa !65 %arrayidx = getelementptr inbounds i64, i64* %4, i64 %indvars.iv - %5 = load i64, i64* %arrayidx, align 8, !tbaa !63 + %5 = load i64, i64* %arrayidx, align 8, !tbaa !66 %6 = trunc i64 %indvars.iv to i32 - %call2 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.12, i64 0, i64 0), i32 %6, i64 %5) + %call2 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.11, i64 0, i64 0), i32 %6, i64 %5) %indvars.iv.next = add nuw i64 %indvars.iv, 1 - %7 = load i32, i32* %num_dims, align 8, !tbaa !61 + %7 = load i32, i32* %num_dims, align 8, !tbaa !64 %8 = sext i32 %7 to i64 %cmp = icmp slt i64 %indvars.iv.next, %8 br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit } ; Function Attrs: nounwind uwtable -define void @_Z14compareTensorsPvS_(i8* %tensor1_ptr, i8* %tensor2_ptr) local_unnamed_addr #0 { +define void @_Z14compareTensorsPvS_(i8* %tensor1_ptr, i8* %tensor2_ptr) local_unnamed_addr #3 { entry: - tail call void @hpvm_request_tensor(i8* %tensor1_ptr, i32 0) #7 - tail call void @hpvm_request_tensor(i8* %tensor2_ptr, i32 0) #7 - %host_data = getelementptr inbounds i8, i8* %tensor1_ptr, i64 32 + tail call void @hpvm_request_tensor(i8* %tensor1_ptr, i32 0) #2 + tail call void @hpvm_request_tensor(i8* %tensor2_ptr, i32 0) #2 + %host_data = getelementptr inbounds i8, i8* %tensor1_ptr, i64 48 %0 = bitcast i8* %host_data to float** - %1 = load float*, float** %0, align 8, !tbaa !66 - %host_data1 = getelementptr inbounds i8, i8* %tensor2_ptr, i64 32 + %1 = load float*, float** %0, align 8, !tbaa !68 + %host_data1 = getelementptr inbounds i8, i8* %tensor2_ptr, i64 48 %2 = bitcast i8* %host_data1 to float** - %3 = load float*, float** %2, align 8, !tbaa !66 - %num_elems = getelementptr inbounds i8, i8* %tensor1_ptr, i64 48 + %3 = load float*, float** %2, align 8, !tbaa !68 + %num_elems = getelementptr inbounds i8, i8* %tensor1_ptr, i64 72 %4 = bitcast i8* %num_elems to i64* - %5 = load i64, i64* %4, align 8, !tbaa !64 + %5 = load i64, i64* %4, align 8, !tbaa !67 %cmp17 = icmp eq i64 %5, 0 br i1 %cmp17, label %for.cond.cleanup, label %for.body.preheader @@ -1097,15 +1316,15 @@ for.body: ; preds = %for.inc, %for.body. %conv19 = phi i64 [ %conv, %for.inc ], [ 0, %for.body.preheader ] %i.018 = phi i32 [ %inc, %for.inc ], [ 0, %for.body.preheader ] %arrayidx = getelementptr inbounds float, float* %1, i64 %conv19 - %6 = load float, float* %arrayidx, align 4, !tbaa !68 + %6 = load float, float* %arrayidx, align 4, !tbaa !71 %arrayidx3 = getelementptr inbounds float, float* %3, i64 %conv19 - %7 = load float, float* %arrayidx3, align 4, !tbaa !68 + %7 = load float, float* %arrayidx3, align 4, !tbaa !71 %cmp4 = fcmp fast une float %6, %7 br i1 %cmp4, label %if.then, label %for.inc if.then: ; preds = %for.body - %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([35 x i8], [35 x i8]* @.str.13, i64 0, i64 0), i32 %i.018) - tail call void @abort() #8 + %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([35 x i8], [35 x i8]* @.str.12, i64 0, i64 0), i32 %i.018) + tail call void @abort() #13 unreachable for.inc: ; preds = %for.body @@ -1116,12 +1335,12 @@ for.inc: ; preds = %for.body } ; Function Attrs: nounwind uwtable -define void @_Z13compareValuesPvPfm(i8* %tensor_ptr, float* nocapture readonly %data, i64 %num_elems) local_unnamed_addr #0 { +define void @_Z13compareValuesPvPfm(i8* %tensor_ptr, float* nocapture readonly %data, i64 %num_elems) local_unnamed_addr #3 { entry: - tail call void @hpvm_request_tensor(i8* %tensor_ptr, i32 0) #7 - %host_data = getelementptr inbounds i8, i8* %tensor_ptr, i64 32 + tail call void @hpvm_request_tensor(i8* %tensor_ptr, i32 0) #2 + %host_data = getelementptr inbounds i8, i8* %tensor_ptr, i64 48 %0 = bitcast i8* %host_data to float** - %1 = load float*, float** %0, align 8, !tbaa !66 + %1 = load float*, float** %0, align 8, !tbaa !68 %cmp11 = icmp eq i64 %num_elems, 0 br i1 %cmp11, label %for.cond.cleanup, label %for.body.preheader @@ -1143,21 +1362,21 @@ for.body: ; preds = %for.cond, %for.body %conv13 = phi i64 [ %conv, %for.cond ], [ 0, %for.body.preheader ] %i.012 = phi i32 [ %inc, %for.cond ], [ 0, %for.body.preheader ] %arrayidx = getelementptr inbounds float, float* %1, i64 %conv13 - %2 = load float, float* %arrayidx, align 4, !tbaa !68 + %2 = load float, float* %arrayidx, align 4, !tbaa !71 %arrayidx2 = getelementptr inbounds float, float* %data, i64 %conv13 - %3 = load float, float* %arrayidx2, align 4, !tbaa !68 + %3 = load float, float* %arrayidx2, align 4, !tbaa !71 %cmp3 = fcmp fast une float %2, %3 %inc = add i32 %i.012, 1 br i1 %cmp3, label %if.then, label %for.cond if.then: ; preds = %for.body - %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([21 x i8], [21 x i8]* @.str.14, i64 0, i64 0)) - tail call void @abort() #8 + %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([21 x i8], [21 x i8]* @.str.13, i64 0, i64 0)) + tail call void @abort() #13 unreachable } ; Function Attrs: nounwind uwtable -define i8* @_Z15readInputTensorPKciiiii(i8* %file_name, i32 %data_type, i32 %dim1_size, i32 %dim2_size, i32 %dim3_size, i32 %dim4_size) local_unnamed_addr #0 { +define i8* @_Z15readInputTensorPKciiiii(i8* %file_name, i32 %data_type, i32 %dim1_size, i32 %dim2_size, i32 %dim3_size, i32 %dim4_size) local_unnamed_addr #3 { entry: %mul = mul nsw i32 %dim2_size, %dim1_size %mul1 = mul nsw i32 %mul, %dim3_size @@ -1167,39 +1386,40 @@ entry: %mul5 = mul nsw i32 %mul4, %dim3_size %mul6 = mul nsw i32 %mul5, %dim4_size %conv = sext i32 %mul2 to i64 - %call = tail call noalias i8* @malloc(i64 %conv) #7 + %call = tail call noalias i8* @malloc(i64 %conv) #2 %mul9 = shl nsw i64 %conv, 2 - %call10 = tail call noalias i8* @malloc(i64 %mul9) #7 + %call10 = tail call noalias i8* @malloc(i64 %mul9) #2 %0 = bitcast i8* %call10 to float* - %call11 = tail call %struct._IO_FILE* @fopen(i8* %file_name, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.15, i64 0, i64 0)) + %call11 = tail call %struct._IO_FILE* @fopen(i8* %file_name, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.14, i64 0, i64 0)) %cmp = icmp eq %struct._IO_FILE* %call11, null br i1 %cmp, label %if.then, label %if.end if.then: ; preds = %entry - %call12 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([41 x i8], [41 x i8]* @.str.16, i64 0, i64 0), i8* %file_name) - tail call void @abort() #8 + %call12 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([41 x i8], [41 x i8]* @.str.15, i64 0, i64 0), i8* %file_name) + tail call void @abort() #13 unreachable if.end: ; preds = %entry %call14 = tail call i32 @fseek(%struct._IO_FILE* nonnull %call11, i64 16, i32 1) %call17 = tail call i64 @fread(i8* %call, i64 1, i64 %conv, %struct._IO_FILE* nonnull %call11) - %cmp1962 = icmp eq i32 %mul2, 0 - br i1 %cmp1962, label %for.cond.cleanup, label %for.body.preheader + %call18 = tail call i32 @fclose(%struct._IO_FILE* nonnull %call11) + %cmp2060 = icmp eq i32 %mul2, 0 + br i1 %cmp2060, label %for.cond.cleanup, label %for.body.preheader for.body.preheader: ; preds = %if.end %1 = icmp ugt i64 %conv, 1 %umax = select i1 %1, i64 %conv, i64 1 %min.iters.check = icmp ult i64 %umax, 8 - br i1 %min.iters.check, label %for.body.preheader68, label %min.iters.checked + br i1 %min.iters.check, label %for.body.preheader64, label %min.iters.checked -for.body.preheader68: ; preds = %middle.block, %min.iters.checked, %for.body.preheader - %i.063.ph = phi i64 [ 0, %min.iters.checked ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] +for.body.preheader64: ; preds = %middle.block, %min.iters.checked, %for.body.preheader + %i.061.ph = phi i64 [ 0, %min.iters.checked ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] br label %for.body min.iters.checked: ; preds = %for.body.preheader %n.vec = and i64 %umax, -8 %cmp.zero = icmp eq i64 %n.vec, 0 - br i1 %cmp.zero, label %for.body.preheader68, label %vector.body.preheader + br i1 %cmp.zero, label %for.body.preheader64, label %vector.body.preheader vector.body.preheader: ; preds = %min.iters.checked %2 = add nsw i64 %n.vec, -8 @@ -1213,19 +1433,19 @@ vector.body.prol.preheader: ; preds = %vector.body.prehead vector.body.prol: ; preds = %vector.body.prol.preheader %5 = bitcast i8* %call to <4 x i8>* - %wide.load.prol = load <4 x i8>, <4 x i8>* %5, align 1, !tbaa !87 + %wide.load.prol = load <4 x i8>, <4 x i8>* %5, align 1, !tbaa !93 %6 = getelementptr i8, i8* %call, i64 4 %7 = bitcast i8* %6 to <4 x i8>* - %wide.load67.prol = load <4 x i8>, <4 x i8>* %7, align 1, !tbaa !87 + %wide.load63.prol = load <4 x i8>, <4 x i8>* %7, align 1, !tbaa !93 %8 = uitofp <4 x i8> %wide.load.prol to <4 x float> - %9 = uitofp <4 x i8> %wide.load67.prol to <4 x float> + %9 = uitofp <4 x i8> %wide.load63.prol to <4 x float> %10 = fmul fast <4 x float> %8, <float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000> %11 = fmul fast <4 x float> %9, <float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000> %12 = bitcast i8* %call10 to <4 x float>* - store <4 x float> %10, <4 x float>* %12, align 4, !tbaa !68 + store <4 x float> %10, <4 x float>* %12, align 4, !tbaa !71 %13 = getelementptr i8, i8* %call10, i64 16 %14 = bitcast i8* %13 to <4 x float>* - store <4 x float> %11, <4 x float>* %14, align 4, !tbaa !68 + store <4 x float> %11, <4 x float>* %14, align 4, !tbaa !71 br label %vector.body.prol.loopexit vector.body.prol.loopexit: ; preds = %vector.body.prol, %vector.body.preheader @@ -1240,216 +1460,385 @@ vector.body: ; preds = %vector.body, %vecto %index = phi i64 [ %index.unr, %vector.body.preheader.new ], [ %index.next.1, %vector.body ] %16 = getelementptr inbounds i8, i8* %call, i64 %index %17 = bitcast i8* %16 to <4 x i8>* - %wide.load = load <4 x i8>, <4 x i8>* %17, align 1, !tbaa !87 + %wide.load = load <4 x i8>, <4 x i8>* %17, align 1, !tbaa !93 %18 = getelementptr i8, i8* %16, i64 4 %19 = bitcast i8* %18 to <4 x i8>* - %wide.load67 = load <4 x i8>, <4 x i8>* %19, align 1, !tbaa !87 + %wide.load63 = load <4 x i8>, <4 x i8>* %19, align 1, !tbaa !93 %20 = uitofp <4 x i8> %wide.load to <4 x float> - %21 = uitofp <4 x i8> %wide.load67 to <4 x float> + %21 = uitofp <4 x i8> %wide.load63 to <4 x float> %22 = fmul fast <4 x float> %20, <float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000> %23 = fmul fast <4 x float> %21, <float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000> %24 = getelementptr inbounds float, float* %0, i64 %index %25 = bitcast float* %24 to <4 x float>* - store <4 x float> %22, <4 x float>* %25, align 4, !tbaa !68 + store <4 x float> %22, <4 x float>* %25, align 4, !tbaa !71 %26 = getelementptr float, float* %24, i64 4 %27 = bitcast float* %26 to <4 x float>* - store <4 x float> %23, <4 x float>* %27, align 4, !tbaa !68 + store <4 x float> %23, <4 x float>* %27, align 4, !tbaa !71 %index.next = add i64 %index, 8 %28 = getelementptr inbounds i8, i8* %call, i64 %index.next %29 = bitcast i8* %28 to <4 x i8>* - %wide.load.1 = load <4 x i8>, <4 x i8>* %29, align 1, !tbaa !87 + %wide.load.1 = load <4 x i8>, <4 x i8>* %29, align 1, !tbaa !93 %30 = getelementptr i8, i8* %28, i64 4 %31 = bitcast i8* %30 to <4 x i8>* - %wide.load67.1 = load <4 x i8>, <4 x i8>* %31, align 1, !tbaa !87 + %wide.load63.1 = load <4 x i8>, <4 x i8>* %31, align 1, !tbaa !93 %32 = uitofp <4 x i8> %wide.load.1 to <4 x float> - %33 = uitofp <4 x i8> %wide.load67.1 to <4 x float> + %33 = uitofp <4 x i8> %wide.load63.1 to <4 x float> %34 = fmul fast <4 x float> %32, <float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000> %35 = fmul fast <4 x float> %33, <float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000> %36 = getelementptr inbounds float, float* %0, i64 %index.next %37 = bitcast float* %36 to <4 x float>* - store <4 x float> %34, <4 x float>* %37, align 4, !tbaa !68 + store <4 x float> %34, <4 x float>* %37, align 4, !tbaa !71 %38 = getelementptr float, float* %36, i64 4 %39 = bitcast float* %38 to <4 x float>* - store <4 x float> %35, <4 x float>* %39, align 4, !tbaa !68 + store <4 x float> %35, <4 x float>* %39, align 4, !tbaa !71 %index.next.1 = add i64 %index, 16 %40 = icmp eq i64 %index.next.1, %n.vec - br i1 %40, label %middle.block.unr-lcssa, label %vector.body, !llvm.loop !88 + br i1 %40, label %middle.block.unr-lcssa, label %vector.body, !llvm.loop !94 middle.block.unr-lcssa: ; preds = %vector.body br label %middle.block middle.block: ; preds = %middle.block.unr-lcssa, %vector.body.prol.loopexit %cmp.n = icmp eq i64 %umax, %n.vec - br i1 %cmp.n, label %for.cond.cleanup.loopexit, label %for.body.preheader68 - -for.cond.cleanup.loopexit.loopexit: ; preds = %for.body - br label %for.cond.cleanup.loopexit + br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader64 -for.cond.cleanup.loopexit: ; preds = %for.cond.cleanup.loopexit.loopexit, %middle.block - %arrayidx22.phi.trans.insert = getelementptr inbounds i8, i8* %call10, i64 40 - %.phi.trans.insert = bitcast i8* %arrayidx22.phi.trans.insert to float* - %.pre = load float, float* %.phi.trans.insert, align 4, !tbaa !68 - %phitmp = fpext float %.pre to double +for.cond.cleanup.loopexit: ; preds = %for.body br label %for.cond.cleanup -for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %if.end - %41 = phi double [ %phitmp, %for.cond.cleanup.loopexit ], [ undef, %if.end ] - %call24 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str.17, i64 0, i64 0), i32 10, double %41) - %conv25 = sext i32 %dim1_size to i64 - %conv26 = sext i32 %dim2_size to i64 - %conv27 = sext i32 %dim3_size to i64 - %conv28 = sext i32 %dim4_size to i64 - %call29 = tail call i8* @create4DTensor(i32 %data_type, i32 0, i64 %conv25, i64 %conv26, i64 %conv27, i64 %conv28) #7 - %conv30 = sext i32 %mul6 to i64 - tail call void @initTensorData(i8* %call29, i8* %call10, i64 %conv30) #7 - ret i8* %call29 - -for.body: ; preds = %for.body, %for.body.preheader68 - %i.063 = phi i64 [ %inc, %for.body ], [ %i.063.ph, %for.body.preheader68 ] - %arrayidx = getelementptr inbounds i8, i8* %call, i64 %i.063 - %42 = load i8, i8* %arrayidx, align 1, !tbaa !87 - %conv20 = uitofp i8 %42 to float - %div = fmul fast float %conv20, 0x3F70101020000000 - %arrayidx21 = getelementptr inbounds float, float* %0, i64 %i.063 - store float %div, float* %arrayidx21, align 4, !tbaa !68 - %inc = add nuw i64 %i.063, 1 - %cmp19 = icmp ult i64 %inc, %conv - br i1 %cmp19, label %for.body, label %for.cond.cleanup.loopexit.loopexit, !llvm.loop !89 +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %middle.block, %if.end + %conv23 = sext i32 %dim1_size to i64 + %conv24 = sext i32 %dim2_size to i64 + %conv25 = sext i32 %dim3_size to i64 + %conv26 = sext i32 %dim4_size to i64 + %call27 = tail call i8* @create4DTensor(i32 %data_type, i32 0, i64 %conv23, i64 %conv24, i64 %conv25, i64 %conv26) #2 + %conv28 = sext i32 %mul6 to i64 + tail call void @initTensorData(i8* %call27, i8* %call10, i64 %conv28) #2 + ret i8* %call27 + +for.body: ; preds = %for.body, %for.body.preheader64 + %i.061 = phi i64 [ %inc, %for.body ], [ %i.061.ph, %for.body.preheader64 ] + %arrayidx = getelementptr inbounds i8, i8* %call, i64 %i.061 + %41 = load i8, i8* %arrayidx, align 1, !tbaa !93 + %conv21 = uitofp i8 %41 to float + %div = fmul fast float %conv21, 0x3F70101020000000 + %arrayidx22 = getelementptr inbounds float, float* %0, i64 %i.061 + store float %div, float* %arrayidx22, align 4, !tbaa !71 + %inc = add nuw i64 %i.061, 1 + %cmp20 = icmp ult i64 %inc, %conv + br i1 %cmp20, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !95 } ; Function Attrs: nounwind -declare noalias i8* @malloc(i64) local_unnamed_addr #2 +declare noalias i8* @malloc(i64) local_unnamed_addr #1 ; Function Attrs: nounwind -declare i32 @fseek(%struct._IO_FILE* nocapture, i64, i32) local_unnamed_addr #2 +declare i32 @fseek(%struct._IO_FILE* nocapture, i64, i32) local_unnamed_addr #1 ; Function Attrs: nounwind -declare i64 @fread(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) local_unnamed_addr #2 +declare i64 @fread(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) local_unnamed_addr #1 -declare i8* @create4DTensor(i32, i32, i64, i64, i64, i64) local_unnamed_addr #3 +declare i8* @create4DTensor(i32, i32, i64, i64, i64, i64) local_unnamed_addr #0 -declare void @initTensorData(i8*, i8*, i64) local_unnamed_addr #3 +declare void @initTensorData(i8*, i8*, i64) local_unnamed_addr #0 ; Function Attrs: nounwind uwtable -define %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %file_name, i32 %data_type, i32 %dim1_size, i32 %dim2_size, i32 %dim3_size, i32 %dim4_size) local_unnamed_addr #0 { +define %struct.Tensor* @_Z21readTrainedWeightsCPUPKciiiii(i8* %file_name, i32 %data_type, i32 %dim1_size, i32 %dim2_size, i32 %dim3_size, i32 %dim4_size) local_unnamed_addr #3 { entry: %mul = mul nsw i32 %dim2_size, %dim1_size %mul1 = mul nsw i32 %mul, %dim3_size %mul2 = mul nsw i32 %mul1, %dim4_size %conv = sext i32 %mul2 to i64 - %mul7 = shl nsw i64 %conv, 2 - %call = tail call noalias i8* @malloc(i64 %mul7) #7 - %call8 = tail call %struct._IO_FILE* @fopen(i8* %file_name, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.15, i64 0, i64 0)) - %cmp = icmp eq %struct._IO_FILE* %call8, null + %mul3 = shl i32 %dim1_size, 2 + %mul4 = mul nsw i32 %mul3, %dim2_size + %mul5 = mul nsw i32 %mul4, %dim3_size + %mul6 = mul nsw i32 %mul5, %dim4_size + %conv7 = sext i32 %mul6 to i64 + %mul8 = shl nsw i64 %conv, 2 + %call = tail call noalias i8* @malloc(i64 %mul8) #2 + %call9 = tail call %struct._IO_FILE* @fopen(i8* %file_name, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.14, i64 0, i64 0)) + %cmp = icmp eq %struct._IO_FILE* %call9, null br i1 %cmp, label %if.then, label %if.end if.then: ; preds = %entry - %call9 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([41 x i8], [41 x i8]* @.str.16, i64 0, i64 0), i8* %file_name) - tail call void @abort() #8 + %call10 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([41 x i8], [41 x i8]* @.str.15, i64 0, i64 0), i8* %file_name) + tail call void @abort() #13 unreachable if.end: ; preds = %entry - %0 = bitcast i8* %call to float* - %mul3 = shl i32 %dim1_size, 2 + %call12 = tail call i32 @fseek(%struct._IO_FILE* nonnull %call9, i64 0, i32 1) + %call13 = tail call i64 @fread(i8* %call, i64 1, i64 %conv7, %struct._IO_FILE* nonnull %call9) + %call14 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([40 x i8], [40 x i8]* @.str.16, i64 0, i64 0), i64 %conv7, i64 %call13) + %call15 = tail call i32 @fclose(%struct._IO_FILE* nonnull %call9) + %conv16 = sext i32 %dim1_size to i64 + %conv17 = sext i32 %dim2_size to i64 + %conv18 = sext i32 %dim3_size to i64 + %conv19 = sext i32 %dim4_size to i64 + %call20 = tail call i8* @create4DTensor(i32 %data_type, i32 0, i64 %conv16, i64 %conv17, i64 %conv18, i64 %conv19) #2 + %0 = bitcast i8* %call20 to %struct.Tensor* + tail call void @initTensorData(i8* %call20, i8* %call, i64 %conv7) #2 + tail call void @free(i8* %call) #2 + ret %struct.Tensor* %0 +} + +; Function Attrs: nounwind +declare void @free(i8* nocapture) local_unnamed_addr #1 + +; Function Attrs: nounwind uwtable +define %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %file_name, i32 %data_type, i64 %dim1_size, i64 %dim2_size, i64 %dim3_size, i64 %dim4_size) local_unnamed_addr #3 { +entry: + %mul3 = shl i64 %dim1_size, 2 + %mul4 = mul i64 %mul3, %dim2_size + %mul5 = mul i64 %mul4, %dim3_size + %mul6 = mul i64 %mul5, %dim4_size + %call = tail call noalias i8* @malloc(i64 %mul6) #2 + %call8 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str.17, i64 0, i64 0), i64 %mul6) + %call9 = tail call %struct._IO_FILE* @fopen(i8* %file_name, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.14, i64 0, i64 0)) + %cmp = icmp eq %struct._IO_FILE* %call9, null + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %call10 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([41 x i8], [41 x i8]* @.str.15, i64 0, i64 0), i8* %file_name) + tail call void @abort() #13 + unreachable + +if.end: ; preds = %entry + %call12 = tail call i32 @fseek(%struct._IO_FILE* nonnull %call9, i64 0, i32 1) + %call13 = tail call i64 @fread(i8* %call, i64 1, i64 %mul6, %struct._IO_FILE* nonnull %call9) + %call14 = tail call i32 @fclose(%struct._IO_FILE* nonnull %call9) + %call15 = tail call i8* @create4DTensor(i32 %data_type, i32 0, i64 %dim1_size, i64 %dim2_size, i64 %dim3_size, i64 %dim4_size) #2 + %0 = bitcast i8* %call15 to %struct.Tensor* + tail call void @initTensorData(i8* %call15, i8* %call, i64 %mul6) #2 + tail call void @free(i8* %call) #2 + ret %struct.Tensor* %0 +} + +; Function Attrs: nounwind uwtable +define %struct.Tensor* @_Z14readInputBatchPKciiiiii(i8* %file_name, i32 %data_type, i32 %start, i32 %end, i32 %dim2_size, i32 %dim3_size, i32 %dim4_size) local_unnamed_addr #3 { +entry: + %sub = sub nsw i32 %end, %start + %mul = mul nsw i32 %sub, %dim2_size + %mul1 = mul nsw i32 %mul, %dim3_size + %mul2 = mul nsw i32 %mul1, %dim4_size + %conv = sext i32 %mul2 to i64 + %mul3 = shl i32 %sub, 2 %mul4 = mul nsw i32 %mul3, %dim2_size %mul5 = mul nsw i32 %mul4, %dim3_size %mul6 = mul nsw i32 %mul5, %dim4_size - %call11 = tail call i32 @fseek(%struct._IO_FILE* nonnull %call8, i64 0, i32 1) - %conv12 = sext i32 %mul6 to i64 - %call13 = tail call i64 @fread(i8* %call, i64 1, i64 %conv12, %struct._IO_FILE* nonnull %call8) - %conv14 = sext i32 %dim1_size to i64 - %conv15 = sext i32 %dim2_size to i64 - %conv16 = sext i32 %dim3_size to i64 - %conv17 = sext i32 %dim4_size to i64 - %call18 = tail call i8* @create4DTensor(i32 %data_type, i32 0, i64 %conv14, i64 %conv15, i64 %conv16, i64 %conv17) #7 - %1 = bitcast i8* %call18 to %struct.Tensor* - tail call void @initTensorData(i8* %call18, i8* %call, i64 %conv12) #7 - tail call void @hpvm_request_tensor(i8* %call18, i32 0) #7 - %host_data.i = getelementptr inbounds i8, i8* %call18, i64 32 - %2 = bitcast i8* %host_data.i to float** - %3 = load float*, float** %2, align 8, !tbaa !66 - %cmp11.i = icmp eq i32 %mul2, 0 - br i1 %cmp11.i, label %_Z13compareValuesPvPfm.exit, label %for.body.i.preheader - -for.body.i.preheader: ; preds = %if.end - br label %for.body.i - -for.cond.i: ; preds = %for.body.i - %conv.i = zext i32 %inc.i to i64 - %cmp.i = icmp ult i64 %conv.i, %conv - br i1 %cmp.i, label %for.body.i, label %_Z13compareValuesPvPfm.exit.loopexit - -for.body.i: ; preds = %for.cond.i, %for.body.i.preheader - %conv13.i = phi i64 [ %conv.i, %for.cond.i ], [ 0, %for.body.i.preheader ] - %i.012.i = phi i32 [ %inc.i, %for.cond.i ], [ 0, %for.body.i.preheader ] - %arrayidx.i = getelementptr inbounds float, float* %3, i64 %conv13.i - %4 = load float, float* %arrayidx.i, align 4, !tbaa !68 - %arrayidx2.i = getelementptr inbounds float, float* %0, i64 %conv13.i - %5 = load float, float* %arrayidx2.i, align 4, !tbaa !68 - %cmp3.i = fcmp fast une float %4, %5 - %inc.i = add i32 %i.012.i, 1 - br i1 %cmp3.i, label %if.then.i, label %for.cond.i - -if.then.i: ; preds = %for.body.i - %call.i = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([21 x i8], [21 x i8]* @.str.14, i64 0, i64 0)) #7 - tail call void @abort() #8 + %conv7 = sext i32 %mul6 to i64 + %mul8 = shl nsw i64 %conv, 2 + %call = tail call noalias i8* @malloc(i64 %mul8) #2 + %call13 = tail call %struct._IO_FILE* @fopen(i8* %file_name, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.14, i64 0, i64 0)) + %cmp = icmp eq %struct._IO_FILE* %call13, null + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %call14 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([41 x i8], [41 x i8]* @.str.15, i64 0, i64 0), i8* %file_name) + tail call void @abort() #13 unreachable -_Z13compareValuesPvPfm.exit.loopexit: ; preds = %for.cond.i - br label %_Z13compareValuesPvPfm.exit +if.end: ; preds = %entry + %mul9 = shl i32 %start, 2 + %mul10 = mul nsw i32 %mul9, %dim2_size + %mul11 = mul nsw i32 %mul10, %dim3_size + %mul12 = mul nsw i32 %mul11, %dim4_size + %conv15 = sext i32 %mul12 to i64 + %call16 = tail call i32 @fseek(%struct._IO_FILE* nonnull %call13, i64 %conv15, i32 0) + %call17 = tail call i64 @fread(i8* %call, i64 1, i64 %conv7, %struct._IO_FILE* nonnull %call13) + %call18 = tail call i32 @fclose(%struct._IO_FILE* nonnull %call13) + %conv19 = sext i32 %sub to i64 + %conv20 = sext i32 %dim2_size to i64 + %conv21 = sext i32 %dim3_size to i64 + %conv22 = sext i32 %dim4_size to i64 + %call23 = tail call i8* @create4DTensor(i32 %data_type, i32 0, i64 %conv19, i64 %conv20, i64 %conv21, i64 %conv22) #2 + %0 = bitcast i8* %call23 to %struct.Tensor* + tail call void @initTensorData(i8* %call23, i8* %call, i64 %conv7) #2 + tail call void @free(i8* %call) #2 + ret %struct.Tensor* %0 +} + +; Function Attrs: nounwind uwtable +define i8* @_Z14copyInputBatchPKciiiiiPv(i8* %file_name, i32 %start, i32 %end, i32 %dim2_size, i32 %dim3_size, i32 %dim4_size, i8* returned %inputTensor_ptr) local_unnamed_addr #3 { +entry: + %0 = bitcast i8* %inputTensor_ptr to %struct.Tensor* + %sub = sub nsw i32 %end, %start + %mul = mul nsw i32 %sub, %dim2_size + %mul1 = mul nsw i32 %mul, %dim3_size + %mul2 = mul nsw i32 %mul1, %dim4_size + %conv = sext i32 %mul2 to i64 + %mul3 = shl i32 %sub, 2 + %mul4 = mul nsw i32 %mul3, %dim2_size + %mul5 = mul nsw i32 %mul4, %dim3_size + %mul6 = mul nsw i32 %mul5, %dim4_size + %conv7 = sext i32 %mul6 to i64 + %mul8 = shl nsw i64 %conv, 2 + %call = tail call noalias i8* @malloc(i64 %mul8) #2 + %call13 = tail call %struct._IO_FILE* @fopen(i8* %file_name, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.14, i64 0, i64 0)) + %cmp = icmp eq %struct._IO_FILE* %call13, null + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %call14 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([41 x i8], [41 x i8]* @.str.15, i64 0, i64 0), i8* %file_name) + tail call void @abort() #13 + unreachable -_Z13compareValuesPvPfm.exit: ; preds = %_Z13compareValuesPvPfm.exit.loopexit, %if.end - ret %struct.Tensor* %1 +if.end: ; preds = %entry + %mul9 = shl i32 %start, 2 + %mul10 = mul nsw i32 %mul9, %dim2_size + %mul11 = mul nsw i32 %mul10, %dim3_size + %mul12 = mul nsw i32 %mul11, %dim4_size + %conv15 = sext i32 %mul12 to i64 + %call16 = tail call i32 @fseek(%struct._IO_FILE* nonnull %call13, i64 %conv15, i32 0) + %call17 = tail call i64 @fread(i8* %call, i64 1, i64 %conv7, %struct._IO_FILE* nonnull %call13) + %call18 = tail call i32 @fclose(%struct._IO_FILE* nonnull %call13) + tail call void @initTensorData(i8* %inputTensor_ptr, i8* %call, i64 %conv7) #2 + tail call void @free(i8* %call) #2 + %dims = getelementptr inbounds i8, i8* %inputTensor_ptr, i64 88 + %num_dims = bitcast i8* %dims to i32* + %1 = load i32, i32* %num_dims, align 8, !tbaa !64 + %call19 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([31 x i8], [31 x i8]* @.str.18, i64 0, i64 0), i32 %1) + %host_data = getelementptr inbounds i8, i8* %inputTensor_ptr, i64 48 + %2 = bitcast i8* %host_data to i8** + %3 = load i8*, i8** %2, align 8, !tbaa !68 + %cmp20 = icmp eq i8* %3, null + br i1 %cmp20, label %if.then22, label %lor.lhs.false + +lor.lhs.false: ; preds = %if.end + %gpu_data = getelementptr inbounds i8, i8* %inputTensor_ptr, i64 56 + %4 = bitcast i8* %gpu_data to i8** + %5 = load i8*, i8** %4, align 8, !tbaa !58 + %cmp21 = icmp eq i8* %5, null + br i1 %cmp21, label %if.then22, label %if.end24 + +if.then22: ; preds = %lor.lhs.false, %if.end + %puts = tail call i32 @puts(i8* getelementptr inbounds ([27 x i8], [27 x i8]* @str.78, i64 0, i64 0)) + br label %if.end24 + +if.end24: ; preds = %if.then22, %lor.lhs.false + tail call void @changeTensorPlacement(%struct.Tensor* nonnull %0, i32 0) #2 + ret i8* %inputTensor_ptr } +declare void @changeTensorPlacement(%struct.Tensor*, i32) local_unnamed_addr #0 + ; Function Attrs: nounwind uwtable -define noalias i8* @_Z10readLabelsPKci(i8* %labels_file, i32 %num_labels) local_unnamed_addr #0 { +define noalias i8* @_Z10readLabelsPKci(i8* %labels_file, i32 %num_labels) local_unnamed_addr #3 { entry: %conv = sext i32 %num_labels to i64 - %call = tail call noalias i8* @malloc(i64 %conv) #7 - %call1 = tail call %struct._IO_FILE* @fopen(i8* %labels_file, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.15, i64 0, i64 0)) + %call = tail call noalias i8* @malloc(i64 %conv) #2 + %call1 = tail call %struct._IO_FILE* @fopen(i8* %labels_file, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.14, i64 0, i64 0)) %cmp = icmp eq %struct._IO_FILE* %call1, null br i1 %cmp, label %if.then, label %if.end if.then: ; preds = %entry - %call2 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([40 x i8], [40 x i8]* @.str.18, i64 0, i64 0), i8* %labels_file) - tail call void @abort() #8 + %call2 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([40 x i8], [40 x i8]* @.str.20, i64 0, i64 0), i8* %labels_file) + tail call void @abort() #13 unreachable if.end: ; preds = %entry %call5 = tail call i64 @fread(i8* %call, i64 1, i64 %conv, %struct._IO_FILE* nonnull %call1) - %call6 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([26 x i8], [26 x i8]* @.str.19, i64 0, i64 0), i64 %call5) + %call6 = tail call i32 @fclose(%struct._IO_FILE* nonnull %call1) + ret i8* %call +} + +; Function Attrs: nounwind uwtable +define noalias i32* @_Z11readLabels3PKci(i8* %labels_file, i32 %num_labels) local_unnamed_addr #3 { +entry: + %conv = sext i32 %num_labels to i64 + %mul = shl nsw i64 %conv, 2 + %call = tail call noalias i8* @malloc(i64 %mul) #2 + %call1 = tail call %struct._IO_FILE* @fopen(i8* %labels_file, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.14, i64 0, i64 0)) + %cmp = icmp eq %struct._IO_FILE* %call1, null + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %call2 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([40 x i8], [40 x i8]* @.str.20, i64 0, i64 0), i8* %labels_file) + tail call void @abort() #13 + unreachable + +if.end: ; preds = %entry + %0 = bitcast i8* %call to i32* + %call5 = tail call i64 @fread(i8* %call, i64 1, i64 %mul, %struct._IO_FILE* nonnull %call1) + %call6 = tail call i32 @fclose(%struct._IO_FILE* nonnull %call1) + ret i32* %0 +} + +; Function Attrs: nounwind uwtable +define noalias i8* @_Z15readLabelsBatchPKcii(i8* %labels_file, i32 %start, i32 %end) local_unnamed_addr #3 { +entry: + %sub = sub nsw i32 %end, %start + %conv2 = sext i32 %sub to i64 + %call = tail call noalias i8* @malloc(i64 %conv2) #2 + %call4 = tail call %struct._IO_FILE* @fopen(i8* %labels_file, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.14, i64 0, i64 0)) + %cmp = icmp eq %struct._IO_FILE* %call4, null + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %call5 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([40 x i8], [40 x i8]* @.str.20, i64 0, i64 0), i8* %labels_file) + tail call void @abort() #13 + unreachable + +if.end: ; preds = %entry + %conv = sext i32 %start to i64 + %call7 = tail call i32 @fseek(%struct._IO_FILE* nonnull %call4, i64 %conv, i32 0) + %call10 = tail call i64 @fread(i8* %call, i64 1, i64 %conv2, %struct._IO_FILE* nonnull %call4) + %call11 = tail call i32 @fclose(%struct._IO_FILE* nonnull %call4) ret i8* %call } ; Function Attrs: nounwind uwtable -define void @_Z15computeAccuracyPciPv(i8* %labels_file, i32 %num_labels, i8* nocapture readonly %result_ptr) local_unnamed_addr #0 { +define noalias i32* @_Z16readLabelsBatch3PKcii(i8* %labels_file, i32 %start, i32 %end) local_unnamed_addr #3 { +entry: + %sub = sub nsw i32 %end, %start + %conv2 = sext i32 %sub to i64 + %mul3 = shl nsw i64 %conv2, 2 + %call = tail call noalias i8* @malloc(i64 %mul3) #2 + %call4 = tail call %struct._IO_FILE* @fopen(i8* %labels_file, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.14, i64 0, i64 0)) + %cmp = icmp eq %struct._IO_FILE* %call4, null + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %call5 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([40 x i8], [40 x i8]* @.str.20, i64 0, i64 0), i8* %labels_file) + tail call void @abort() #13 + unreachable + +if.end: ; preds = %entry + %0 = bitcast i8* %call to i32* + %mul = shl i32 %start, 2 + %conv6 = sext i32 %mul to i64 + %call7 = tail call i32 @fseek(%struct._IO_FILE* nonnull %call4, i64 %conv6, i32 0) + %call10 = tail call i64 @fread(i8* %call, i64 1, i64 %mul3, %struct._IO_FILE* nonnull %call4) + %call11 = tail call i32 @fclose(%struct._IO_FILE* nonnull %call4) + ret i32* %0 +} + +; Function Attrs: nounwind uwtable +define void @_Z15computeAccuracyPKciPv(i8* %labels_file, i32 %num_labels, i8* nocapture readonly %result_ptr) local_unnamed_addr #3 { entry: %ss = alloca %"class.std::__cxx11::basic_ostringstream", align 16 %print_str = alloca %"class.std::__cxx11::basic_string", align 8 %conv.i = sext i32 %num_labels to i64 - %call.i = tail call noalias i8* @malloc(i64 %conv.i) #7 - %call1.i = tail call %struct._IO_FILE* @fopen(i8* %labels_file, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.15, i64 0, i64 0)) #7 + %call.i = tail call noalias i8* @malloc(i64 %conv.i) #2 + %call1.i = tail call %struct._IO_FILE* @fopen(i8* %labels_file, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.14, i64 0, i64 0)) #2 %cmp.i = icmp eq %struct._IO_FILE* %call1.i, null br i1 %cmp.i, label %if.then.i, label %_Z10readLabelsPKci.exit if.then.i: ; preds = %entry - %call2.i = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([40 x i8], [40 x i8]* @.str.18, i64 0, i64 0), i8* %labels_file) #7 - tail call void @abort() #8 + %call2.i = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([40 x i8], [40 x i8]* @.str.20, i64 0, i64 0), i8* %labels_file) #2 + tail call void @abort() #13 unreachable _Z10readLabelsPKci.exit: ; preds = %entry - %call5.i = tail call i64 @fread(i8* %call.i, i64 1, i64 %conv.i, %struct._IO_FILE* nonnull %call1.i) #7 - %call6.i = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([26 x i8], [26 x i8]* @.str.19, i64 0, i64 0), i64 %call5.i) #7 - %dim_sizes = getelementptr inbounds i8, i8* %result_ptr, i64 72 + %call5.i = tail call i64 @fread(i8* %call.i, i64 1, i64 %conv.i, %struct._IO_FILE* nonnull %call1.i) #2 + %call6.i = tail call i32 @fclose(%struct._IO_FILE* nonnull %call1.i) #2 + %dim_sizes = getelementptr inbounds i8, i8* %result_ptr, i64 96 %0 = bitcast i8* %dim_sizes to i64** - %1 = load i64*, i64** %0, align 8, !tbaa !62 - %2 = load i64, i64* %1, align 8, !tbaa !63 + %1 = load i64*, i64** %0, align 8, !tbaa !65 + %2 = load i64, i64* %1, align 8, !tbaa !66 %arrayidx3 = getelementptr inbounds i64, i64* %1, i64 1 - %3 = load i64, i64* %arrayidx3, align 8, !tbaa !63 - %host_data = getelementptr inbounds i8, i8* %result_ptr, i64 32 + %3 = load i64, i64* %arrayidx3, align 8, !tbaa !66 + %host_data = getelementptr inbounds i8, i8* %result_ptr, i64 48 %4 = bitcast i8* %host_data to float** - %5 = load float*, float** %4, align 8, !tbaa !66 + %5 = load float*, float** %4, align 8, !tbaa !68 %cmp92 = icmp eq i64 %2, 0 br i1 %cmp92, label %for.cond.cleanup, label %for.cond4.preheader.preheader @@ -1461,86 +1850,86 @@ for.cond4.preheader: ; preds = %for.cond4.preheader %num_errors.094 = phi i32 [ %num_errors.0.inc21, %for.cond4.preheader ], [ 0, %for.cond4.preheader.preheader ] %mul = mul i64 %indvars.iv, %3 %arrayidx10 = getelementptr inbounds float, float* %5, i64 %mul - %6 = load float, float* %arrayidx10, align 4, !tbaa !68 + %6 = load float, float* %arrayidx10, align 4, !tbaa !71 %add14 = add i64 %mul, 1 %arrayidx15 = getelementptr inbounds float, float* %5, i64 %add14 - %7 = load float, float* %arrayidx15, align 4, !tbaa !68 + %7 = load float, float* %arrayidx15, align 4, !tbaa !71 %cmp16 = fcmp fast olt float %6, %7 %chosen.1 = zext i1 %cmp16 to i32 %conv9.1 = zext i1 %cmp16 to i64 %add.1 = add i64 %conv9.1, %mul %arrayidx10.1 = getelementptr inbounds float, float* %5, i64 %add.1 - %8 = load float, float* %arrayidx10.1, align 4, !tbaa !68 + %8 = load float, float* %arrayidx10.1, align 4, !tbaa !71 %add14.1 = add i64 %mul, 2 %arrayidx15.1 = getelementptr inbounds float, float* %5, i64 %add14.1 - %9 = load float, float* %arrayidx15.1, align 4, !tbaa !68 + %9 = load float, float* %arrayidx15.1, align 4, !tbaa !71 %cmp16.1 = fcmp fast olt float %8, %9 %chosen.1.1 = select i1 %cmp16.1, i32 2, i32 %chosen.1 %conv9.296 = zext i32 %chosen.1.1 to i64 %add.2 = add i64 %conv9.296, %mul %arrayidx10.2 = getelementptr inbounds float, float* %5, i64 %add.2 - %10 = load float, float* %arrayidx10.2, align 4, !tbaa !68 + %10 = load float, float* %arrayidx10.2, align 4, !tbaa !71 %add14.2 = add i64 %mul, 3 %arrayidx15.2 = getelementptr inbounds float, float* %5, i64 %add14.2 - %11 = load float, float* %arrayidx15.2, align 4, !tbaa !68 + %11 = load float, float* %arrayidx15.2, align 4, !tbaa !71 %cmp16.2 = fcmp fast olt float %10, %11 %chosen.1.2 = select i1 %cmp16.2, i32 3, i32 %chosen.1.1 %conv9.397 = zext i32 %chosen.1.2 to i64 %add.3 = add i64 %conv9.397, %mul %arrayidx10.3 = getelementptr inbounds float, float* %5, i64 %add.3 - %12 = load float, float* %arrayidx10.3, align 4, !tbaa !68 + %12 = load float, float* %arrayidx10.3, align 4, !tbaa !71 %add14.3 = add i64 %mul, 4 %arrayidx15.3 = getelementptr inbounds float, float* %5, i64 %add14.3 - %13 = load float, float* %arrayidx15.3, align 4, !tbaa !68 + %13 = load float, float* %arrayidx15.3, align 4, !tbaa !71 %cmp16.3 = fcmp fast olt float %12, %13 %chosen.1.3 = select i1 %cmp16.3, i32 4, i32 %chosen.1.2 %conv9.498 = zext i32 %chosen.1.3 to i64 %add.4 = add i64 %conv9.498, %mul %arrayidx10.4 = getelementptr inbounds float, float* %5, i64 %add.4 - %14 = load float, float* %arrayidx10.4, align 4, !tbaa !68 + %14 = load float, float* %arrayidx10.4, align 4, !tbaa !71 %add14.4 = add i64 %mul, 5 %arrayidx15.4 = getelementptr inbounds float, float* %5, i64 %add14.4 - %15 = load float, float* %arrayidx15.4, align 4, !tbaa !68 + %15 = load float, float* %arrayidx15.4, align 4, !tbaa !71 %cmp16.4 = fcmp fast olt float %14, %15 %chosen.1.4 = select i1 %cmp16.4, i32 5, i32 %chosen.1.3 %conv9.599 = zext i32 %chosen.1.4 to i64 %add.5 = add i64 %conv9.599, %mul %arrayidx10.5 = getelementptr inbounds float, float* %5, i64 %add.5 - %16 = load float, float* %arrayidx10.5, align 4, !tbaa !68 + %16 = load float, float* %arrayidx10.5, align 4, !tbaa !71 %add14.5 = add i64 %mul, 6 %arrayidx15.5 = getelementptr inbounds float, float* %5, i64 %add14.5 - %17 = load float, float* %arrayidx15.5, align 4, !tbaa !68 + %17 = load float, float* %arrayidx15.5, align 4, !tbaa !71 %cmp16.5 = fcmp fast olt float %16, %17 %chosen.1.5 = select i1 %cmp16.5, i32 6, i32 %chosen.1.4 %18 = zext i32 %chosen.1.5 to i64 %add.6 = add i64 %18, %mul %arrayidx10.6 = getelementptr inbounds float, float* %5, i64 %add.6 - %19 = load float, float* %arrayidx10.6, align 4, !tbaa !68 + %19 = load float, float* %arrayidx10.6, align 4, !tbaa !71 %add14.6 = add i64 %mul, 7 %arrayidx15.6 = getelementptr inbounds float, float* %5, i64 %add14.6 - %20 = load float, float* %arrayidx15.6, align 4, !tbaa !68 + %20 = load float, float* %arrayidx15.6, align 4, !tbaa !71 %cmp16.6 = fcmp fast olt float %19, %20 %chosen.1.6 = select i1 %cmp16.6, i32 7, i32 %chosen.1.5 %conv9.7 = sext i32 %chosen.1.6 to i64 %add.7 = add i64 %conv9.7, %mul %arrayidx10.7 = getelementptr inbounds float, float* %5, i64 %add.7 - %21 = load float, float* %arrayidx10.7, align 4, !tbaa !68 + %21 = load float, float* %arrayidx10.7, align 4, !tbaa !71 %add14.7 = add i64 %mul, 8 %arrayidx15.7 = getelementptr inbounds float, float* %5, i64 %add14.7 - %22 = load float, float* %arrayidx15.7, align 4, !tbaa !68 + %22 = load float, float* %arrayidx15.7, align 4, !tbaa !71 %cmp16.7 = fcmp fast olt float %21, %22 %chosen.1.7 = select i1 %cmp16.7, i32 8, i32 %chosen.1.6 %conv9.8 = sext i32 %chosen.1.7 to i64 %add.8 = add i64 %conv9.8, %mul %arrayidx10.8 = getelementptr inbounds float, float* %5, i64 %add.8 - %23 = load float, float* %arrayidx10.8, align 4, !tbaa !68 + %23 = load float, float* %arrayidx10.8, align 4, !tbaa !71 %add14.8 = add i64 %mul, 9 %arrayidx15.8 = getelementptr inbounds float, float* %5, i64 %add14.8 - %24 = load float, float* %arrayidx15.8, align 4, !tbaa !68 + %24 = load float, float* %arrayidx15.8, align 4, !tbaa !71 %cmp16.8 = fcmp fast olt float %23, %24 %chosen.1.8 = select i1 %cmp16.8, i32 9, i32 %chosen.1.7 %arrayidx17 = getelementptr inbounds i8, i8* %call.i, i64 %indvars.iv - %25 = load i8, i8* %arrayidx17, align 1, !tbaa !87 + %25 = load i8, i8* %arrayidx17, align 1, !tbaa !93 %conv18 = zext i8 %25 to i32 %not.cmp19 = icmp ne i32 %chosen.1.8, %conv18 %inc21 = zext i1 %not.cmp19 to i32 @@ -1562,31 +1951,31 @@ for.cond.cleanup: ; preds = %for.cond.cleanup.lo %mul31 = fmul fast double %div, 1.000000e+02 %conv32 = fptrunc double %mul31 to float %conv33 = fpext float %conv32 to double - %call34 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.20, i64 0, i64 0), double %conv33) - %call35 = tail call %struct._IO_FILE* @fopen(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.21, i64 0, i64 0), i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.22, i64 0, i64 0)) + %call34 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.21, i64 0, i64 0), double %conv33) + %call35 = tail call %struct._IO_FILE* @fopen(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.22, i64 0, i64 0), i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.23, i64 0, i64 0)) %cmp36 = icmp eq %struct._IO_FILE* %call35, null br i1 %cmp36, label %if.end44, label %if.then37 if.then37: ; preds = %for.cond.cleanup %26 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i8* - call void @llvm.lifetime.start(i64 376, i8* nonnull %26) #7 + call void @llvm.lifetime.start(i64 376, i8* nonnull %26) #2 %27 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2 %28 = getelementptr inbounds %"class.std::basic_ios", %"class.std::basic_ios"* %27, i64 0, i32 0 - call void @_ZNSt8ios_baseC2Ev(%"class.std::ios_base"* %28) #7 + call void @_ZNSt8ios_baseC2Ev(%"class.std::ios_base"* %28) #2 %29 = getelementptr inbounds %"class.std::basic_ios", %"class.std::basic_ios"* %27, i64 0, i32 0, i32 0 - store i32 (...)** bitcast (i8** getelementptr inbounds ({ [4 x i8*] }, { [4 x i8*] }* @_ZTVSt9basic_iosIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %29, align 16, !tbaa !91 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [4 x i8*] }, { [4 x i8*] }* @_ZTVSt9basic_iosIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %29, align 16, !tbaa !97 %_M_tie.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 1 - store %"class.std::basic_ostream"* null, %"class.std::basic_ostream"** %_M_tie.i.i, align 8, !tbaa !93 + store %"class.std::basic_ostream"* null, %"class.std::basic_ostream"** %_M_tie.i.i, align 8, !tbaa !99 %_M_fill.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 2 - store i8 0, i8* %_M_fill.i.i, align 16, !tbaa !96 + store i8 0, i8* %_M_fill.i.i, align 16, !tbaa !102 %_M_fill_init.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 3 - store i8 0, i8* %_M_fill_init.i.i, align 1, !tbaa !97 + store i8 0, i8* %_M_fill_init.i.i, align 1, !tbaa !103 %_M_streambuf.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 4 %30 = bitcast %"class.std::basic_streambuf"** %_M_streambuf.i.i to i8* - call void @llvm.memset.p0i8.i64(i8* %30, i8 0, i64 32, i32 8, i1 false) #7 + call void @llvm.memset.p0i8.i64(i8* %30, i8 0, i64 32, i32 8, i1 false) #2 %31 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 1) to i64*), align 8 %32 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i64* - store i64 %31, i64* %32, align 16, !tbaa !91 + store i64 %31, i64* %32, align 16, !tbaa !97 %33 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 2) to i64*), align 8 %34 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i8** %vtable.cast.i.i = inttoptr i64 %31 to i8* @@ -1595,79 +1984,79 @@ if.then37: ; preds = %for.cond.cleanup %vbase.offset.i.i = load i64, i64* %35, align 8 %add.ptr.i.i = getelementptr inbounds i8, i8* %26, i64 %vbase.offset.i.i %36 = bitcast i8* %add.ptr.i.i to i64* - store i64 %33, i64* %36, align 8, !tbaa !91 - %vtable3.i.i = load i8*, i8** %34, align 16, !tbaa !91 + store i64 %33, i64* %36, align 8, !tbaa !97 + %vtable3.i.i = load i8*, i8** %34, align 16, !tbaa !97 %vbase.offset.ptr4.i.i = getelementptr i8, i8* %vtable3.i.i, i64 -24 %37 = bitcast i8* %vbase.offset.ptr4.i.i to i64* %vbase.offset5.i.i = load i64, i64* %37, align 8 %add.ptr6.i.i = getelementptr inbounds i8, i8* %26, i64 %vbase.offset5.i.i %38 = bitcast i8* %add.ptr6.i.i to %"class.std::basic_ios"* - call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %38, %"class.std::basic_streambuf"* null) #7 - store i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 1, i64 3) to i32 (...)**), i32 (...)*** %29, align 16, !tbaa !91 + call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %38, %"class.std::basic_streambuf"* null) #2 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 1, i64 3) to i32 (...)**), i32 (...)*** %29, align 16, !tbaa !97 %_M_stringbuf.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1 %39 = getelementptr inbounds %"class.std::__cxx11::basic_stringbuf", %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i, i64 0, i32 0, i32 0 %40 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to <2 x i32 (...)**>* - store <2 x i32 (...)**> <i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 3) to i32 (...)**), i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**)>, <2 x i32 (...)**>* %40, align 16, !tbaa !91 + store <2 x i32 (...)**> <i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 3) to i32 (...)**), i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**)>, <2 x i32 (...)**>* %40, align 16, !tbaa !97 %_M_in_beg.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 1 %_M_buf_locale.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 7 %41 = bitcast i8** %_M_in_beg.i.i.i to i8* - call void @llvm.memset.p0i8.i64(i8* %41, i8 0, i64 48, i32 8, i1 false) #7 - call void @_ZNSt6localeC1Ev(%"class.std::locale"* %_M_buf_locale.i.i.i) #7 - store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %39, align 8, !tbaa !91 + call void @llvm.memset.p0i8.i64(i8* %41, i8 0, i64 48, i32 8, i1 false) #2 + call void @_ZNSt6localeC1Ev(%"class.std::locale"* %_M_buf_locale.i.i.i) #2 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %39, align 8, !tbaa !97 %_M_mode.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 1 - store i32 16, i32* %_M_mode.i.i, align 8, !tbaa !98 + store i32 16, i32* %_M_mode.i.i, align 8, !tbaa !104 %_M_string.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2 %42 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 2 %43 = bitcast %"class.std::__cxx11::basic_string"* %_M_string.i.i to %union.anon** - store %union.anon* %42, %union.anon** %43, align 8, !tbaa !103 + store %union.anon* %42, %union.anon** %43, align 8, !tbaa !109 %_M_string_length.i.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 1 - store i64 0, i64* %_M_string_length.i.i.i.i.i, align 8, !tbaa !104 + store i64 0, i64* %_M_string_length.i.i.i.i.i, align 8, !tbaa !110 %.cast.i.i.i = bitcast %union.anon* %42 to i8* - store i8 0, i8* %.cast.i.i.i, align 8, !tbaa !87 - %vtable.i = load i8*, i8** %34, align 16, !tbaa !91 + store i8 0, i8* %.cast.i.i.i, align 8, !tbaa !93 + %vtable.i = load i8*, i8** %34, align 16, !tbaa !97 %vbase.offset.ptr.i = getelementptr i8, i8* %vtable.i, i64 -24 %44 = bitcast i8* %vbase.offset.ptr.i to i64* %vbase.offset.i = load i64, i64* %44, align 8 %add.ptr2.i = getelementptr inbounds i8, i8* %26, i64 %vbase.offset.i %45 = bitcast i8* %add.ptr2.i to %"class.std::basic_ios"* %46 = getelementptr inbounds %"class.std::__cxx11::basic_stringbuf", %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i, i64 0, i32 0 - call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %45, %"class.std::basic_streambuf"* %46) #7 + call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %45, %"class.std::basic_streambuf"* %46) #2 %47 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to %"class.std::basic_ostream"* - %vtable.i74 = load i8*, i8** %34, align 16, !tbaa !91 + %vtable.i74 = load i8*, i8** %34, align 16, !tbaa !97 %vbase.offset.ptr.i75 = getelementptr i8, i8* %vtable.i74, i64 -24 %48 = bitcast i8* %vbase.offset.ptr.i75 to i64* %vbase.offset.i76 = load i64, i64* %48, align 8 %add.ptr.i = getelementptr inbounds i8, i8* %26, i64 %vbase.offset.i76 %_M_flags.i = getelementptr inbounds i8, i8* %add.ptr.i, i64 24 %49 = bitcast i8* %_M_flags.i to i32* - %50 = load i32, i32* %49, align 4, !tbaa !105 + %50 = load i32, i32* %49, align 4, !tbaa !111 %and.i = and i32 %50, -261 %or.i = or i32 %and.i, 4 - store i32 %or.i, i32* %49, align 4, !tbaa !105 - %call.i84 = call dereferenceable(272) %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"* nonnull %47, double %conv33) #7 + store i32 %or.i, i32* %49, align 4, !tbaa !111 + %call.i84 = call dereferenceable(272) %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"* nonnull %47, double %conv33) #2 %51 = bitcast %"class.std::__cxx11::basic_string"* %print_str to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %51) #7 - call void @_ZNKSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEE3strEv(%"class.std::__cxx11::basic_string"* nonnull sret %print_str, %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %51) #2 + call void @_ZNKSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEE3strEv(%"class.std::__cxx11::basic_string"* nonnull sret %print_str, %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i) #2 %_M_p.i.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 0, i32 0 - %52 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !107 + %52 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !113 %_M_string_length.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 1 - %53 = load i64, i64* %_M_string_length.i, align 8, !tbaa !104 + %53 = load i64, i64* %_M_string_length.i, align 8, !tbaa !110 %call42 = call i64 @fwrite(i8* %52, i64 1, i64 %53, %struct._IO_FILE* nonnull %call35) %call43 = call i32 @fclose(%struct._IO_FILE* nonnull %call35) - %54 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !107 + %54 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !113 %55 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 2 %arraydecay.i.i.i.i = bitcast %union.anon* %55 to i8* %cmp.i.i.i = icmp eq i8* %54, %arraydecay.i.i.i.i br i1 %cmp.i.i.i, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit, label %if.then.i.i if.then.i.i: ; preds = %if.then37 - call void @_ZdlPv(i8* %54) #7 + call void @_ZdlPv(i8* %54) #2 br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit: ; preds = %if.then.i.i, %if.then37 - call void @llvm.lifetime.end(i64 32, i8* nonnull %51) #7 + call void @llvm.lifetime.end(i64 32, i8* nonnull %51) #2 %56 = load i64, i64* bitcast ([4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE to i64*), align 8 - store i64 %56, i64* %32, align 16, !tbaa !91 + store i64 %56, i64* %32, align 16, !tbaa !97 %57 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 3) to i64*), align 8 %vtable.cast.i.i86 = inttoptr i64 %56 to i8* %vbase.offset.ptr.i.i87 = getelementptr i8, i8* %vtable.cast.i.i86, i64 -24 @@ -1675,24 +2064,24 @@ _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit: ; preds = %if.th %vbase.offset.i.i88 = load i64, i64* %58, align 8 %add.ptr.i.i89 = getelementptr inbounds i8, i8* %26, i64 %vbase.offset.i.i88 %59 = bitcast i8* %add.ptr.i.i89 to i64* - store i64 %57, i64* %59, align 8, !tbaa !91 + store i64 %57, i64* %59, align 8, !tbaa !97 %60 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 0 - store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %60, align 8, !tbaa !91 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %60, align 8, !tbaa !97 %_M_p.i.i.i.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 0, i32 0 - %61 = load i8*, i8** %_M_p.i.i.i.i.i.i.i, align 8, !tbaa !107 + %61 = load i8*, i8** %_M_p.i.i.i.i.i.i.i, align 8, !tbaa !113 %cmp.i.i.i.i.i.i = icmp eq i8* %61, %.cast.i.i.i br i1 %cmp.i.i.i.i.i.i, label %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit, label %if.then.i.i.i.i.i if.then.i.i.i.i.i: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit - call void @_ZdlPv(i8* %61) #7 + call void @_ZdlPv(i8* %61) #2 br label %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit _ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit: ; preds = %if.then.i.i.i.i.i, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit - store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %60, align 8, !tbaa !91 - call void @_ZNSt6localeD1Ev(%"class.std::locale"* nonnull %_M_buf_locale.i.i.i) #7 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %60, align 8, !tbaa !97 + call void @_ZNSt6localeD1Ev(%"class.std::locale"* nonnull %_M_buf_locale.i.i.i) #2 %62 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 0 - call void @_ZNSt8ios_baseD2Ev(%"class.std::ios_base"* %62) #7 - call void @llvm.lifetime.end(i64 376, i8* nonnull %26) #7 + call void @_ZNSt8ios_baseD2Ev(%"class.std::ios_base"* %62) #2 + call void @llvm.lifetime.end(i64 376, i8* nonnull %26) #2 br label %if.end44 if.end44: ; preds = %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit, %for.cond.cleanup @@ -1700,270 +2089,2596 @@ if.end44: ; preds = %_ZNSt7__cxx1119basi } ; Function Attrs: nounwind uwtable -define void @_Z16computeAccuracy2PhiPv(i8* nocapture readonly %labels, i32 %num_labels, i8* nocapture readonly %result_ptr) local_unnamed_addr #0 { +define float @_Z16computeAccuracy2PhiPvm(i8* nocapture readonly %labels, i32 %batch_size, i8* nocapture readonly %result_ptr, i64 %num_classes) local_unnamed_addr #3 { entry: %ss = alloca %"class.std::__cxx11::basic_ostringstream", align 16 %print_str = alloca %"class.std::__cxx11::basic_string", align 8 - %dim_sizes = getelementptr inbounds i8, i8* %result_ptr, i64 72 + %dim_sizes = getelementptr inbounds i8, i8* %result_ptr, i64 96 %0 = bitcast i8* %dim_sizes to i64** - %1 = load i64*, i64** %0, align 8, !tbaa !62 - %2 = load i64, i64* %1, align 8, !tbaa !63 + %1 = load i64*, i64** %0, align 8, !tbaa !65 + %2 = load i64, i64* %1, align 8, !tbaa !66 %arrayidx3 = getelementptr inbounds i64, i64* %1, i64 1 - %3 = load i64, i64* %arrayidx3, align 8, !tbaa !63 - %host_data = getelementptr inbounds i8, i8* %result_ptr, i64 32 + %3 = load i64, i64* %arrayidx3, align 8, !tbaa !66 + %host_data = getelementptr inbounds i8, i8* %result_ptr, i64 48 %4 = bitcast i8* %host_data to float** - %5 = load float*, float** %4, align 8, !tbaa !66 - %cmp82 = icmp eq i64 %2, 0 - br i1 %cmp82, label %for.cond.cleanup, label %for.cond4.preheader.preheader - -for.cond4.preheader.preheader: ; preds = %entry + %5 = load float*, float** %4, align 8, !tbaa !68 + %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([34 x i8], [34 x i8]* @.str.24, i64 0, i64 0), i64 %2, i64 %3) + %cmp89 = icmp eq i64 %2, 0 + br i1 %cmp89, label %for.cond.cleanup, label %for.cond4.preheader.lr.ph + +for.cond4.preheader.lr.ph: ; preds = %entry + %cmp685 = icmp ugt i64 %3, 1 + br i1 %cmp685, label %for.cond4.preheader.us.preheader, label %for.cond4.preheader.preheader + +for.cond4.preheader.us.preheader: ; preds = %for.cond4.preheader.lr.ph + %6 = and i64 %3, 1 + %lcmp.mod = icmp eq i64 %6, 0 + %7 = icmp eq i64 %3, 2 + br label %for.cond4.preheader.us + +for.cond4.preheader.preheader: ; preds = %for.cond4.preheader.lr.ph + %min.iters.check = icmp ult i64 %2, 8 + br i1 %min.iters.check, label %for.cond4.preheader.preheader111, label %min.iters.checked + +for.cond4.preheader.preheader111: ; preds = %middle.block, %vector.scevcheck, %min.iters.checked, %for.cond4.preheader.preheader + %conv92.ph = phi i64 [ 0, %vector.scevcheck ], [ 0, %min.iters.checked ], [ 0, %for.cond4.preheader.preheader ], [ %n.vec, %middle.block ] + %num_errors.091.ph = phi i32 [ 0, %vector.scevcheck ], [ 0, %min.iters.checked ], [ 0, %for.cond4.preheader.preheader ], [ %45, %middle.block ] + %i.090.ph = phi i32 [ 0, %vector.scevcheck ], [ 0, %min.iters.checked ], [ 0, %for.cond4.preheader.preheader ], [ %cast.crd, %middle.block ] br label %for.cond4.preheader -for.cond4.preheader: ; preds = %for.cond4.preheader, %for.cond4.preheader.preheader - %indvars.iv = phi i64 [ %indvars.iv.next, %for.cond4.preheader ], [ 0, %for.cond4.preheader.preheader ] - %num_errors.084 = phi i32 [ %num_errors.0.inc21, %for.cond4.preheader ], [ 0, %for.cond4.preheader.preheader ] - %mul = mul i64 %indvars.iv, %3 - %arrayidx10 = getelementptr inbounds float, float* %5, i64 %mul - %6 = load float, float* %arrayidx10, align 4, !tbaa !68 - %add14 = add i64 %mul, 1 - %arrayidx15 = getelementptr inbounds float, float* %5, i64 %add14 - %7 = load float, float* %arrayidx15, align 4, !tbaa !68 - %cmp16 = fcmp fast olt float %6, %7 - %chosen.1 = zext i1 %cmp16 to i32 - %conv9.1 = zext i1 %cmp16 to i64 - %add.1 = add i64 %conv9.1, %mul - %arrayidx10.1 = getelementptr inbounds float, float* %5, i64 %add.1 - %8 = load float, float* %arrayidx10.1, align 4, !tbaa !68 - %add14.1 = add i64 %mul, 2 - %arrayidx15.1 = getelementptr inbounds float, float* %5, i64 %add14.1 - %9 = load float, float* %arrayidx15.1, align 4, !tbaa !68 - %cmp16.1 = fcmp fast olt float %8, %9 - %chosen.1.1 = select i1 %cmp16.1, i32 2, i32 %chosen.1 - %conv9.286 = zext i32 %chosen.1.1 to i64 - %add.2 = add i64 %conv9.286, %mul - %arrayidx10.2 = getelementptr inbounds float, float* %5, i64 %add.2 - %10 = load float, float* %arrayidx10.2, align 4, !tbaa !68 - %add14.2 = add i64 %mul, 3 - %arrayidx15.2 = getelementptr inbounds float, float* %5, i64 %add14.2 - %11 = load float, float* %arrayidx15.2, align 4, !tbaa !68 - %cmp16.2 = fcmp fast olt float %10, %11 - %chosen.1.2 = select i1 %cmp16.2, i32 3, i32 %chosen.1.1 - %conv9.387 = zext i32 %chosen.1.2 to i64 - %add.3 = add i64 %conv9.387, %mul - %arrayidx10.3 = getelementptr inbounds float, float* %5, i64 %add.3 - %12 = load float, float* %arrayidx10.3, align 4, !tbaa !68 - %add14.3 = add i64 %mul, 4 - %arrayidx15.3 = getelementptr inbounds float, float* %5, i64 %add14.3 - %13 = load float, float* %arrayidx15.3, align 4, !tbaa !68 - %cmp16.3 = fcmp fast olt float %12, %13 - %chosen.1.3 = select i1 %cmp16.3, i32 4, i32 %chosen.1.2 - %conv9.488 = zext i32 %chosen.1.3 to i64 - %add.4 = add i64 %conv9.488, %mul - %arrayidx10.4 = getelementptr inbounds float, float* %5, i64 %add.4 - %14 = load float, float* %arrayidx10.4, align 4, !tbaa !68 - %add14.4 = add i64 %mul, 5 - %arrayidx15.4 = getelementptr inbounds float, float* %5, i64 %add14.4 - %15 = load float, float* %arrayidx15.4, align 4, !tbaa !68 - %cmp16.4 = fcmp fast olt float %14, %15 - %chosen.1.4 = select i1 %cmp16.4, i32 5, i32 %chosen.1.3 - %conv9.589 = zext i32 %chosen.1.4 to i64 - %add.5 = add i64 %conv9.589, %mul - %arrayidx10.5 = getelementptr inbounds float, float* %5, i64 %add.5 - %16 = load float, float* %arrayidx10.5, align 4, !tbaa !68 - %add14.5 = add i64 %mul, 6 - %arrayidx15.5 = getelementptr inbounds float, float* %5, i64 %add14.5 - %17 = load float, float* %arrayidx15.5, align 4, !tbaa !68 - %cmp16.5 = fcmp fast olt float %16, %17 - %chosen.1.5 = select i1 %cmp16.5, i32 6, i32 %chosen.1.4 - %18 = zext i32 %chosen.1.5 to i64 - %add.6 = add i64 %18, %mul - %arrayidx10.6 = getelementptr inbounds float, float* %5, i64 %add.6 - %19 = load float, float* %arrayidx10.6, align 4, !tbaa !68 - %add14.6 = add i64 %mul, 7 - %arrayidx15.6 = getelementptr inbounds float, float* %5, i64 %add14.6 - %20 = load float, float* %arrayidx15.6, align 4, !tbaa !68 - %cmp16.6 = fcmp fast olt float %19, %20 - %chosen.1.6 = select i1 %cmp16.6, i32 7, i32 %chosen.1.5 - %conv9.7 = sext i32 %chosen.1.6 to i64 - %add.7 = add i64 %conv9.7, %mul - %arrayidx10.7 = getelementptr inbounds float, float* %5, i64 %add.7 - %21 = load float, float* %arrayidx10.7, align 4, !tbaa !68 - %add14.7 = add i64 %mul, 8 - %arrayidx15.7 = getelementptr inbounds float, float* %5, i64 %add14.7 - %22 = load float, float* %arrayidx15.7, align 4, !tbaa !68 - %cmp16.7 = fcmp fast olt float %21, %22 - %chosen.1.7 = select i1 %cmp16.7, i32 8, i32 %chosen.1.6 - %conv9.8 = sext i32 %chosen.1.7 to i64 - %add.8 = add i64 %conv9.8, %mul - %arrayidx10.8 = getelementptr inbounds float, float* %5, i64 %add.8 - %23 = load float, float* %arrayidx10.8, align 4, !tbaa !68 - %add14.8 = add i64 %mul, 9 - %arrayidx15.8 = getelementptr inbounds float, float* %5, i64 %add14.8 - %24 = load float, float* %arrayidx15.8, align 4, !tbaa !68 - %cmp16.8 = fcmp fast olt float %23, %24 - %chosen.1.8 = select i1 %cmp16.8, i32 9, i32 %chosen.1.7 - %arrayidx17 = getelementptr inbounds i8, i8* %labels, i64 %indvars.iv - %25 = load i8, i8* %arrayidx17, align 1, !tbaa !87 - %conv18 = zext i8 %25 to i32 - %not.cmp19 = icmp ne i32 %chosen.1.8, %conv18 - %inc21 = zext i1 %not.cmp19 to i32 - %num_errors.0.inc21 = add nsw i32 %inc21, %num_errors.084 +min.iters.checked: ; preds = %for.cond4.preheader.preheader + %n.vec = and i64 %2, -8 + %cmp.zero = icmp eq i64 %n.vec, 0 + br i1 %cmp.zero, label %for.cond4.preheader.preheader111, label %vector.scevcheck + +vector.scevcheck: ; preds = %min.iters.checked + %8 = add i64 %2, -1 + %9 = trunc i64 %8 to i32 + %10 = icmp eq i32 %9, -1 + %11 = icmp ugt i64 %8, 4294967295 + %12 = or i1 %10, %11 + %cast.crd = trunc i64 %n.vec to i32 + br i1 %12, label %for.cond4.preheader.preheader111, label %vector.body.preheader + +vector.body.preheader: ; preds = %vector.scevcheck + %13 = add i64 %n.vec, -8 + %14 = lshr exact i64 %13, 3 + %15 = and i64 %14, 1 + %lcmp.mod115 = icmp eq i64 %15, 0 + br i1 %lcmp.mod115, label %vector.body.prol.preheader, label %vector.body.prol.loopexit + +vector.body.prol.preheader: ; preds = %vector.body.preheader + br label %vector.body.prol + +vector.body.prol: ; preds = %vector.body.prol.preheader + %16 = bitcast i8* %labels to <4 x i8>* + %wide.load.prol = load <4 x i8>, <4 x i8>* %16, align 1, !tbaa !93 + %17 = getelementptr i8, i8* %labels, i64 4 + %18 = bitcast i8* %17 to <4 x i8>* + %wide.load107.prol = load <4 x i8>, <4 x i8>* %18, align 1, !tbaa !93 + %19 = icmp ne <4 x i8> %wide.load.prol, zeroinitializer + %20 = icmp ne <4 x i8> %wide.load107.prol, zeroinitializer + %21 = zext <4 x i1> %19 to <4 x i32> + %22 = zext <4 x i1> %20 to <4 x i32> + br label %vector.body.prol.loopexit + +vector.body.prol.loopexit: ; preds = %vector.body.prol, %vector.body.preheader + %.lcssa113.unr = phi <4 x i32> [ undef, %vector.body.preheader ], [ %21, %vector.body.prol ] + %.lcssa.unr = phi <4 x i32> [ undef, %vector.body.preheader ], [ %22, %vector.body.prol ] + %index.unr = phi i64 [ 0, %vector.body.preheader ], [ 8, %vector.body.prol ] + %vec.phi.unr = phi <4 x i32> [ zeroinitializer, %vector.body.preheader ], [ %21, %vector.body.prol ] + %vec.phi102.unr = phi <4 x i32> [ zeroinitializer, %vector.body.preheader ], [ %22, %vector.body.prol ] + %23 = icmp eq i64 %14, 0 + br i1 %23, label %middle.block, label %vector.body.preheader.new + +vector.body.preheader.new: ; preds = %vector.body.prol.loopexit + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.body.preheader.new + %index = phi i64 [ %index.unr, %vector.body.preheader.new ], [ %index.next.1, %vector.body ] + %vec.phi = phi <4 x i32> [ %vec.phi.unr, %vector.body.preheader.new ], [ %42, %vector.body ] + %vec.phi102 = phi <4 x i32> [ %vec.phi102.unr, %vector.body.preheader.new ], [ %43, %vector.body ] + %24 = getelementptr inbounds i8, i8* %labels, i64 %index + %25 = bitcast i8* %24 to <4 x i8>* + %wide.load = load <4 x i8>, <4 x i8>* %25, align 1, !tbaa !93 + %26 = getelementptr i8, i8* %24, i64 4 + %27 = bitcast i8* %26 to <4 x i8>* + %wide.load107 = load <4 x i8>, <4 x i8>* %27, align 1, !tbaa !93 + %28 = icmp ne <4 x i8> %wide.load, zeroinitializer + %29 = icmp ne <4 x i8> %wide.load107, zeroinitializer + %30 = zext <4 x i1> %28 to <4 x i32> + %31 = zext <4 x i1> %29 to <4 x i32> + %32 = add nsw <4 x i32> %30, %vec.phi + %33 = add nsw <4 x i32> %31, %vec.phi102 + %index.next = add i64 %index, 8 + %34 = getelementptr inbounds i8, i8* %labels, i64 %index.next + %35 = bitcast i8* %34 to <4 x i8>* + %wide.load.1 = load <4 x i8>, <4 x i8>* %35, align 1, !tbaa !93 + %36 = getelementptr i8, i8* %34, i64 4 + %37 = bitcast i8* %36 to <4 x i8>* + %wide.load107.1 = load <4 x i8>, <4 x i8>* %37, align 1, !tbaa !93 + %38 = icmp ne <4 x i8> %wide.load.1, zeroinitializer + %39 = icmp ne <4 x i8> %wide.load107.1, zeroinitializer + %40 = zext <4 x i1> %38 to <4 x i32> + %41 = zext <4 x i1> %39 to <4 x i32> + %42 = add nsw <4 x i32> %40, %32 + %43 = add nsw <4 x i32> %41, %33 + %index.next.1 = add i64 %index, 16 + %44 = icmp eq i64 %index.next.1, %n.vec + br i1 %44, label %middle.block.unr-lcssa, label %vector.body, !llvm.loop !114 + +middle.block.unr-lcssa: ; preds = %vector.body + br label %middle.block + +middle.block: ; preds = %middle.block.unr-lcssa, %vector.body.prol.loopexit + %.lcssa113 = phi <4 x i32> [ %.lcssa113.unr, %vector.body.prol.loopexit ], [ %42, %middle.block.unr-lcssa ] + %.lcssa = phi <4 x i32> [ %.lcssa.unr, %vector.body.prol.loopexit ], [ %43, %middle.block.unr-lcssa ] + %bin.rdx = add <4 x i32> %.lcssa, %.lcssa113 + %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> + %bin.rdx108 = add <4 x i32> %bin.rdx, %rdx.shuf + %rdx.shuf109 = shufflevector <4 x i32> %bin.rdx108, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> + %bin.rdx110 = add <4 x i32> %bin.rdx108, %rdx.shuf109 + %45 = extractelement <4 x i32> %bin.rdx110, i32 0 + %cmp.n = icmp eq i64 %2, %n.vec + br i1 %cmp.n, label %for.cond.cleanup, label %for.cond4.preheader.preheader111 + +for.cond4.preheader.us: ; preds = %for.cond4.for.cond.cleanup7_crit_edge.us, %for.cond4.preheader.us.preheader + %conv92.us = phi i64 [ %conv.us, %for.cond4.for.cond.cleanup7_crit_edge.us ], [ 0, %for.cond4.preheader.us.preheader ] + %num_errors.091.us = phi i32 [ %num_errors.0.inc22.us, %for.cond4.for.cond.cleanup7_crit_edge.us ], [ 0, %for.cond4.preheader.us.preheader ] + %i.090.us = phi i32 [ %inc25.us, %for.cond4.for.cond.cleanup7_crit_edge.us ], [ 0, %for.cond4.preheader.us.preheader ] + %mul.us = mul i64 %conv92.us, %3 + br i1 %lcmp.mod, label %for.body8.us.prol.preheader, label %for.body8.us.prol.loopexit.unr-lcssa + +for.body8.us.prol.preheader: ; preds = %for.cond4.preheader.us + br label %for.body8.us.prol + +for.body8.us.prol: ; preds = %for.body8.us.prol.preheader + %arrayidx11.us.prol = getelementptr inbounds float, float* %5, i64 %mul.us + %46 = load float, float* %arrayidx11.us.prol, align 4, !tbaa !71 + %add15.us.prol = add i64 %mul.us, 1 + %arrayidx16.us.prol = getelementptr inbounds float, float* %5, i64 %add15.us.prol + %47 = load float, float* %arrayidx16.us.prol, align 4, !tbaa !71 + %cmp17.us.prol = fcmp fast olt float %46, %47 + %chosen.1.us.prol = zext i1 %cmp17.us.prol to i32 + br label %for.body8.us.prol.loopexit.unr-lcssa + +for.body8.us.prol.loopexit.unr-lcssa: ; preds = %for.body8.us.prol, %for.cond4.preheader.us + %chosen.1.us.lcssa.unr.ph = phi i32 [ %chosen.1.us.prol, %for.body8.us.prol ], [ undef, %for.cond4.preheader.us ] + %indvars.iv.unr.ph = phi i64 [ 2, %for.body8.us.prol ], [ 1, %for.cond4.preheader.us ] + %chosen.086.us.unr.ph = phi i32 [ %chosen.1.us.prol, %for.body8.us.prol ], [ 0, %for.cond4.preheader.us ] + br label %for.body8.us.prol.loopexit + +for.body8.us.prol.loopexit: ; preds = %for.body8.us.prol.loopexit.unr-lcssa + br i1 %7, label %for.cond4.for.cond.cleanup7_crit_edge.us, label %for.cond4.preheader.us.new + +for.cond4.preheader.us.new: ; preds = %for.body8.us.prol.loopexit + br label %for.body8.us + +for.body8.us: ; preds = %for.body8.us, %for.cond4.preheader.us.new + %indvars.iv = phi i64 [ %indvars.iv.unr.ph, %for.cond4.preheader.us.new ], [ %indvars.iv.next.1, %for.body8.us ] + %chosen.086.us = phi i32 [ %chosen.086.us.unr.ph, %for.cond4.preheader.us.new ], [ %chosen.1.us.1, %for.body8.us ] + %conv10.us = sext i32 %chosen.086.us to i64 + %add.us = add i64 %conv10.us, %mul.us + %arrayidx11.us = getelementptr inbounds float, float* %5, i64 %add.us + %48 = load float, float* %arrayidx11.us, align 4, !tbaa !71 + %add15.us = add i64 %indvars.iv, %mul.us + %arrayidx16.us = getelementptr inbounds float, float* %5, i64 %add15.us + %49 = load float, float* %arrayidx16.us, align 4, !tbaa !71 + %cmp17.us = fcmp fast olt float %48, %49 + %50 = trunc i64 %indvars.iv to i32 + %chosen.1.us = select i1 %cmp17.us, i32 %50, i32 %chosen.086.us %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond = icmp eq i64 %indvars.iv.next, %2 - br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.cond4.preheader + %conv10.us.1 = sext i32 %chosen.1.us to i64 + %add.us.1 = add i64 %conv10.us.1, %mul.us + %arrayidx11.us.1 = getelementptr inbounds float, float* %5, i64 %add.us.1 + %51 = load float, float* %arrayidx11.us.1, align 4, !tbaa !71 + %add15.us.1 = add i64 %indvars.iv.next, %mul.us + %arrayidx16.us.1 = getelementptr inbounds float, float* %5, i64 %add15.us.1 + %52 = load float, float* %arrayidx16.us.1, align 4, !tbaa !71 + %cmp17.us.1 = fcmp fast olt float %51, %52 + %53 = trunc i64 %indvars.iv.next to i32 + %chosen.1.us.1 = select i1 %cmp17.us.1, i32 %53, i32 %chosen.1.us + %indvars.iv.next.1 = add nsw i64 %indvars.iv, 2 + %exitcond.1 = icmp eq i64 %indvars.iv.next.1, %3 + br i1 %exitcond.1, label %for.cond4.for.cond.cleanup7_crit_edge.us.unr-lcssa, label %for.body8.us + +for.cond4.for.cond.cleanup7_crit_edge.us.unr-lcssa: ; preds = %for.body8.us + br label %for.cond4.for.cond.cleanup7_crit_edge.us + +for.cond4.for.cond.cleanup7_crit_edge.us: ; preds = %for.cond4.for.cond.cleanup7_crit_edge.us.unr-lcssa, %for.body8.us.prol.loopexit + %chosen.1.us.lcssa = phi i32 [ %chosen.1.us.lcssa.unr.ph, %for.body8.us.prol.loopexit ], [ %chosen.1.us.1, %for.cond4.for.cond.cleanup7_crit_edge.us.unr-lcssa ] + %arrayidx18.us = getelementptr inbounds i8, i8* %labels, i64 %conv92.us + %54 = load i8, i8* %arrayidx18.us, align 1, !tbaa !93 + %conv19.us = zext i8 %54 to i32 + %not.cmp20.us = icmp ne i32 %chosen.1.us.lcssa, %conv19.us + %inc22.us = zext i1 %not.cmp20.us to i32 + %num_errors.0.inc22.us = add nsw i32 %inc22.us, %num_errors.091.us + %inc25.us = add i32 %i.090.us, 1 + %conv.us = zext i32 %inc25.us to i64 + %cmp.us = icmp ult i64 %conv.us, %2 + br i1 %cmp.us, label %for.cond4.preheader.us, label %for.cond.cleanup.loopexit + +for.cond4.preheader: ; preds = %for.cond4.preheader, %for.cond4.preheader.preheader111 + %conv92 = phi i64 [ %conv, %for.cond4.preheader ], [ %conv92.ph, %for.cond4.preheader.preheader111 ] + %num_errors.091 = phi i32 [ %num_errors.0.inc22, %for.cond4.preheader ], [ %num_errors.091.ph, %for.cond4.preheader.preheader111 ] + %i.090 = phi i32 [ %inc25, %for.cond4.preheader ], [ %i.090.ph, %for.cond4.preheader.preheader111 ] + %arrayidx18 = getelementptr inbounds i8, i8* %labels, i64 %conv92 + %55 = load i8, i8* %arrayidx18, align 1, !tbaa !93 + %not.cmp20 = icmp ne i8 %55, 0 + %inc22 = zext i1 %not.cmp20 to i32 + %num_errors.0.inc22 = add nsw i32 %inc22, %num_errors.091 + %inc25 = add i32 %i.090, 1 + %conv = zext i32 %inc25 to i64 + %cmp = icmp ult i64 %conv, %2 + br i1 %cmp, label %for.cond4.preheader, label %for.cond.cleanup.loopexit112, !llvm.loop !115 + +for.cond.cleanup.loopexit: ; preds = %for.cond4.for.cond.cleanup7_crit_edge.us + br label %for.cond.cleanup -for.cond.cleanup.loopexit: ; preds = %for.cond4.preheader - %phitmp = sext i32 %num_errors.0.inc21 to i64 +for.cond.cleanup.loopexit112: ; preds = %for.cond4.preheader br label %for.cond.cleanup -for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry - %num_errors.0.lcssa = phi i64 [ 0, %entry ], [ %phitmp, %for.cond.cleanup.loopexit ] - %sub = sub i64 %2, %num_errors.0.lcssa +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit112, %for.cond.cleanup.loopexit, %middle.block, %entry + %num_errors.0.lcssa = phi i32 [ 0, %entry ], [ %45, %middle.block ], [ %num_errors.0.inc22.us, %for.cond.cleanup.loopexit ], [ %num_errors.0.inc22, %for.cond.cleanup.loopexit112 ] + %conv27 = sext i32 %num_errors.0.lcssa to i64 + %sub = sub i64 %2, %conv27 + %conv28 = uitofp i64 %sub to double + %conv30 = uitofp i64 %2 to double + %div = fdiv fast double %conv28, %conv30 + %mul32 = fmul fast double %div, 1.000000e+02 + %conv33 = fptrunc double %mul32 to float + %conv34 = fpext float %conv33 to double + %call35 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.21, i64 0, i64 0), double %conv34) + %call36 = tail call %struct._IO_FILE* @fopen(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.22, i64 0, i64 0), i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.23, i64 0, i64 0)) + %cmp37 = icmp eq %struct._IO_FILE* %call36, null + br i1 %cmp37, label %if.end44, label %if.then38 + +if.then38: ; preds = %for.cond.cleanup + %56 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i8* + call void @llvm.lifetime.start(i64 376, i8* nonnull %56) #2 + %57 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2 + %58 = getelementptr inbounds %"class.std::basic_ios", %"class.std::basic_ios"* %57, i64 0, i32 0 + call void @_ZNSt8ios_baseC2Ev(%"class.std::ios_base"* %58) #2 + %59 = getelementptr inbounds %"class.std::basic_ios", %"class.std::basic_ios"* %57, i64 0, i32 0, i32 0 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [4 x i8*] }, { [4 x i8*] }* @_ZTVSt9basic_iosIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %59, align 16, !tbaa !97 + %_M_tie.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 1 + store %"class.std::basic_ostream"* null, %"class.std::basic_ostream"** %_M_tie.i.i, align 8, !tbaa !99 + %_M_fill.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 2 + store i8 0, i8* %_M_fill.i.i, align 16, !tbaa !102 + %_M_fill_init.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 3 + store i8 0, i8* %_M_fill_init.i.i, align 1, !tbaa !103 + %_M_streambuf.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 4 + %60 = bitcast %"class.std::basic_streambuf"** %_M_streambuf.i.i to i8* + call void @llvm.memset.p0i8.i64(i8* %60, i8 0, i64 32, i32 8, i1 false) #2 + %61 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 1) to i64*), align 8 + %62 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i64* + store i64 %61, i64* %62, align 16, !tbaa !97 + %63 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 2) to i64*), align 8 + %64 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i8** + %vtable.cast.i.i = inttoptr i64 %61 to i8* + %vbase.offset.ptr.i.i = getelementptr i8, i8* %vtable.cast.i.i, i64 -24 + %65 = bitcast i8* %vbase.offset.ptr.i.i to i64* + %vbase.offset.i.i = load i64, i64* %65, align 8 + %add.ptr.i.i = getelementptr inbounds i8, i8* %56, i64 %vbase.offset.i.i + %66 = bitcast i8* %add.ptr.i.i to i64* + store i64 %63, i64* %66, align 8, !tbaa !97 + %vtable3.i.i = load i8*, i8** %64, align 16, !tbaa !97 + %vbase.offset.ptr4.i.i = getelementptr i8, i8* %vtable3.i.i, i64 -24 + %67 = bitcast i8* %vbase.offset.ptr4.i.i to i64* + %vbase.offset5.i.i = load i64, i64* %67, align 8 + %add.ptr6.i.i = getelementptr inbounds i8, i8* %56, i64 %vbase.offset5.i.i + %68 = bitcast i8* %add.ptr6.i.i to %"class.std::basic_ios"* + call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %68, %"class.std::basic_streambuf"* null) #2 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 1, i64 3) to i32 (...)**), i32 (...)*** %59, align 16, !tbaa !97 + %_M_stringbuf.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1 + %69 = getelementptr inbounds %"class.std::__cxx11::basic_stringbuf", %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i, i64 0, i32 0, i32 0 + %70 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to <2 x i32 (...)**>* + store <2 x i32 (...)**> <i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 3) to i32 (...)**), i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**)>, <2 x i32 (...)**>* %70, align 16, !tbaa !97 + %_M_in_beg.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 1 + %_M_buf_locale.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 7 + %71 = bitcast i8** %_M_in_beg.i.i.i to i8* + call void @llvm.memset.p0i8.i64(i8* %71, i8 0, i64 48, i32 8, i1 false) #2 + call void @_ZNSt6localeC1Ev(%"class.std::locale"* %_M_buf_locale.i.i.i) #2 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %69, align 8, !tbaa !97 + %_M_mode.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 1 + store i32 16, i32* %_M_mode.i.i, align 8, !tbaa !104 + %_M_string.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2 + %72 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 2 + %73 = bitcast %"class.std::__cxx11::basic_string"* %_M_string.i.i to %union.anon** + store %union.anon* %72, %union.anon** %73, align 8, !tbaa !109 + %_M_string_length.i.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 1 + store i64 0, i64* %_M_string_length.i.i.i.i.i, align 8, !tbaa !110 + %.cast.i.i.i = bitcast %union.anon* %72 to i8* + store i8 0, i8* %.cast.i.i.i, align 8, !tbaa !93 + %vtable.i = load i8*, i8** %64, align 16, !tbaa !97 + %vbase.offset.ptr.i = getelementptr i8, i8* %vtable.i, i64 -24 + %74 = bitcast i8* %vbase.offset.ptr.i to i64* + %vbase.offset.i = load i64, i64* %74, align 8 + %add.ptr2.i = getelementptr inbounds i8, i8* %56, i64 %vbase.offset.i + %75 = bitcast i8* %add.ptr2.i to %"class.std::basic_ios"* + %76 = getelementptr inbounds %"class.std::__cxx11::basic_stringbuf", %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i, i64 0, i32 0 + call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %75, %"class.std::basic_streambuf"* %76) #2 + %77 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to %"class.std::basic_ostream"* + %vtable.i77 = load i8*, i8** %64, align 16, !tbaa !97 + %vbase.offset.ptr.i78 = getelementptr i8, i8* %vtable.i77, i64 -24 + %78 = bitcast i8* %vbase.offset.ptr.i78 to i64* + %vbase.offset.i79 = load i64, i64* %78, align 8 + %add.ptr.i = getelementptr inbounds i8, i8* %56, i64 %vbase.offset.i79 + %_M_flags.i.i = getelementptr inbounds i8, i8* %add.ptr.i, i64 24 + %79 = bitcast i8* %_M_flags.i.i to i32* + %80 = load i32, i32* %79, align 8, !tbaa !116 + %and.i.i.i.i = and i32 %80, -261 + %or.i.i.i.i = or i32 %and.i.i.i.i, 4 + store i32 %or.i.i.i.i, i32* %79, align 4, !tbaa !111 + %call.i = call dereferenceable(272) %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"* nonnull %77, double %conv34) #2 + %81 = bitcast %"class.std::__cxx11::basic_string"* %print_str to i8* + call void @llvm.lifetime.start(i64 32, i8* nonnull %81) #2 + call void @_ZNKSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEE3strEv(%"class.std::__cxx11::basic_string"* nonnull sret %print_str, %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i) #2 + %_M_p.i.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 0, i32 0 + %82 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !113 + %_M_string_length.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 1 + %83 = load i64, i64* %_M_string_length.i, align 8, !tbaa !110 + %call43 = call i64 @fwrite(i8* %82, i64 1, i64 %83, %struct._IO_FILE* nonnull %call36) + %84 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !113 + %85 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 2 + %arraydecay.i.i.i.i = bitcast %union.anon* %85 to i8* + %cmp.i.i.i = icmp eq i8* %84, %arraydecay.i.i.i.i + br i1 %cmp.i.i.i, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit, label %if.then.i.i + +if.then.i.i: ; preds = %if.then38 + call void @_ZdlPv(i8* %84) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit: ; preds = %if.then.i.i, %if.then38 + call void @llvm.lifetime.end(i64 32, i8* nonnull %81) #2 + %86 = load i64, i64* bitcast ([4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE to i64*), align 8 + store i64 %86, i64* %62, align 16, !tbaa !97 + %87 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 3) to i64*), align 8 + %vtable.cast.i.i81 = inttoptr i64 %86 to i8* + %vbase.offset.ptr.i.i82 = getelementptr i8, i8* %vtable.cast.i.i81, i64 -24 + %88 = bitcast i8* %vbase.offset.ptr.i.i82 to i64* + %vbase.offset.i.i83 = load i64, i64* %88, align 8 + %add.ptr.i.i84 = getelementptr inbounds i8, i8* %56, i64 %vbase.offset.i.i83 + %89 = bitcast i8* %add.ptr.i.i84 to i64* + store i64 %87, i64* %89, align 8, !tbaa !97 + %90 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 0 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %90, align 8, !tbaa !97 + %_M_p.i.i.i.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 0, i32 0 + %91 = load i8*, i8** %_M_p.i.i.i.i.i.i.i, align 8, !tbaa !113 + %cmp.i.i.i.i.i.i = icmp eq i8* %91, %.cast.i.i.i + br i1 %cmp.i.i.i.i.i.i, label %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit, label %if.then.i.i.i.i.i + +if.then.i.i.i.i.i: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit + call void @_ZdlPv(i8* %91) #2 + br label %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit + +_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit: ; preds = %if.then.i.i.i.i.i, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %90, align 8, !tbaa !97 + call void @_ZNSt6localeD1Ev(%"class.std::locale"* nonnull %_M_buf_locale.i.i.i) #2 + %92 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 0 + call void @_ZNSt8ios_baseD2Ev(%"class.std::ios_base"* %92) #2 + call void @llvm.lifetime.end(i64 376, i8* nonnull %56) #2 + br label %if.end44 + +if.end44: ; preds = %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit, %for.cond.cleanup + %call45 = call i32 @fclose(%struct._IO_FILE* %call36) + ret float %conv33 +} + +; Function Attrs: nounwind uwtable +define float @_Z16computeAccuracy3PjPv(i32* nocapture readonly %labels, i8* nocapture readonly %result_ptr) local_unnamed_addr #3 { +entry: + %ss = alloca %"class.std::__cxx11::basic_ostringstream", align 16 + %print_str = alloca %"class.std::__cxx11::basic_string", align 8 + %dim_sizes = getelementptr inbounds i8, i8* %result_ptr, i64 96 + %0 = bitcast i8* %dim_sizes to i64** + %1 = load i64*, i64** %0, align 8, !tbaa !65 + %2 = load i64, i64* %1, align 8, !tbaa !66 + %arrayidx3 = getelementptr inbounds i64, i64* %1, i64 1 + %3 = load i64, i64* %arrayidx3, align 8, !tbaa !66 + %host_data = getelementptr inbounds i8, i8* %result_ptr, i64 48 + %4 = bitcast i8* %host_data to float** + %5 = load float*, float** %4, align 8, !tbaa !68 + %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([37 x i8], [37 x i8]* @.str.25, i64 0, i64 0), i64 %2, i64 %3) + %cmp89 = icmp eq i64 %2, 0 + br i1 %cmp89, label %for.cond.cleanup, label %for.cond4.preheader.lr.ph + +for.cond4.preheader.lr.ph: ; preds = %entry + %cmp685 = icmp ugt i64 %3, 1 + br i1 %cmp685, label %for.cond4.preheader.us.preheader, label %for.cond4.preheader.preheader + +for.cond4.preheader.us.preheader: ; preds = %for.cond4.preheader.lr.ph + %6 = and i64 %3, 1 + %lcmp.mod = icmp eq i64 %6, 0 + %7 = icmp eq i64 %3, 2 + br label %for.cond4.preheader.us + +for.cond4.preheader.preheader: ; preds = %for.cond4.preheader.lr.ph + %min.iters.check = icmp ult i64 %2, 8 + br i1 %min.iters.check, label %for.cond4.preheader.preheader109, label %min.iters.checked + +for.cond4.preheader.preheader109: ; preds = %middle.block, %min.iters.checked, %for.cond4.preheader.preheader + %indvars.iv98.ph = phi i64 [ 0, %min.iters.checked ], [ 0, %for.cond4.preheader.preheader ], [ %n.vec, %middle.block ] + %num_errors.091.ph = phi i32 [ 0, %min.iters.checked ], [ 0, %for.cond4.preheader.preheader ], [ %40, %middle.block ] + br label %for.cond4.preheader + +min.iters.checked: ; preds = %for.cond4.preheader.preheader + %n.vec = and i64 %2, -8 + %cmp.zero = icmp eq i64 %n.vec, 0 + br i1 %cmp.zero, label %for.cond4.preheader.preheader109, label %vector.body.preheader + +vector.body.preheader: ; preds = %min.iters.checked + %8 = add i64 %n.vec, -8 + %9 = lshr exact i64 %8, 3 + %10 = and i64 %9, 1 + %lcmp.mod113 = icmp eq i64 %10, 0 + br i1 %lcmp.mod113, label %vector.body.prol.preheader, label %vector.body.prol.loopexit + +vector.body.prol.preheader: ; preds = %vector.body.preheader + br label %vector.body.prol + +vector.body.prol: ; preds = %vector.body.prol.preheader + %11 = bitcast i32* %labels to <4 x i32>* + %wide.load.prol = load <4 x i32>, <4 x i32>* %11, align 4, !tbaa !121 + %12 = getelementptr i32, i32* %labels, i64 4 + %13 = bitcast i32* %12 to <4 x i32>* + %wide.load105.prol = load <4 x i32>, <4 x i32>* %13, align 4, !tbaa !121 + %14 = icmp ne <4 x i32> %wide.load.prol, zeroinitializer + %15 = icmp ne <4 x i32> %wide.load105.prol, zeroinitializer + %16 = zext <4 x i1> %14 to <4 x i32> + %17 = zext <4 x i1> %15 to <4 x i32> + br label %vector.body.prol.loopexit + +vector.body.prol.loopexit: ; preds = %vector.body.prol, %vector.body.preheader + %.lcssa111.unr = phi <4 x i32> [ undef, %vector.body.preheader ], [ %16, %vector.body.prol ] + %.lcssa.unr = phi <4 x i32> [ undef, %vector.body.preheader ], [ %17, %vector.body.prol ] + %index.unr = phi i64 [ 0, %vector.body.preheader ], [ 8, %vector.body.prol ] + %vec.phi.unr = phi <4 x i32> [ zeroinitializer, %vector.body.preheader ], [ %16, %vector.body.prol ] + %vec.phi104.unr = phi <4 x i32> [ zeroinitializer, %vector.body.preheader ], [ %17, %vector.body.prol ] + %18 = icmp eq i64 %9, 0 + br i1 %18, label %middle.block, label %vector.body.preheader.new + +vector.body.preheader.new: ; preds = %vector.body.prol.loopexit + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.body.preheader.new + %index = phi i64 [ %index.unr, %vector.body.preheader.new ], [ %index.next.1, %vector.body ] + %vec.phi = phi <4 x i32> [ %vec.phi.unr, %vector.body.preheader.new ], [ %37, %vector.body ] + %vec.phi104 = phi <4 x i32> [ %vec.phi104.unr, %vector.body.preheader.new ], [ %38, %vector.body ] + %19 = getelementptr inbounds i32, i32* %labels, i64 %index + %20 = bitcast i32* %19 to <4 x i32>* + %wide.load = load <4 x i32>, <4 x i32>* %20, align 4, !tbaa !121 + %21 = getelementptr i32, i32* %19, i64 4 + %22 = bitcast i32* %21 to <4 x i32>* + %wide.load105 = load <4 x i32>, <4 x i32>* %22, align 4, !tbaa !121 + %23 = icmp ne <4 x i32> %wide.load, zeroinitializer + %24 = icmp ne <4 x i32> %wide.load105, zeroinitializer + %25 = zext <4 x i1> %23 to <4 x i32> + %26 = zext <4 x i1> %24 to <4 x i32> + %27 = add nsw <4 x i32> %25, %vec.phi + %28 = add nsw <4 x i32> %26, %vec.phi104 + %index.next = add i64 %index, 8 + %29 = getelementptr inbounds i32, i32* %labels, i64 %index.next + %30 = bitcast i32* %29 to <4 x i32>* + %wide.load.1 = load <4 x i32>, <4 x i32>* %30, align 4, !tbaa !121 + %31 = getelementptr i32, i32* %29, i64 4 + %32 = bitcast i32* %31 to <4 x i32>* + %wide.load105.1 = load <4 x i32>, <4 x i32>* %32, align 4, !tbaa !121 + %33 = icmp ne <4 x i32> %wide.load.1, zeroinitializer + %34 = icmp ne <4 x i32> %wide.load105.1, zeroinitializer + %35 = zext <4 x i1> %33 to <4 x i32> + %36 = zext <4 x i1> %34 to <4 x i32> + %37 = add nsw <4 x i32> %35, %27 + %38 = add nsw <4 x i32> %36, %28 + %index.next.1 = add i64 %index, 16 + %39 = icmp eq i64 %index.next.1, %n.vec + br i1 %39, label %middle.block.unr-lcssa, label %vector.body, !llvm.loop !122 + +middle.block.unr-lcssa: ; preds = %vector.body + br label %middle.block + +middle.block: ; preds = %middle.block.unr-lcssa, %vector.body.prol.loopexit + %.lcssa111 = phi <4 x i32> [ %.lcssa111.unr, %vector.body.prol.loopexit ], [ %37, %middle.block.unr-lcssa ] + %.lcssa = phi <4 x i32> [ %.lcssa.unr, %vector.body.prol.loopexit ], [ %38, %middle.block.unr-lcssa ] + %bin.rdx = add <4 x i32> %.lcssa, %.lcssa111 + %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> + %bin.rdx106 = add <4 x i32> %bin.rdx, %rdx.shuf + %rdx.shuf107 = shufflevector <4 x i32> %bin.rdx106, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> + %bin.rdx108 = add <4 x i32> %bin.rdx106, %rdx.shuf107 + %40 = extractelement <4 x i32> %bin.rdx108, i32 0 + %cmp.n = icmp eq i64 %2, %n.vec + br i1 %cmp.n, label %for.cond.cleanup, label %for.cond4.preheader.preheader109 + +for.cond4.preheader.us: ; preds = %for.cond4.for.cond.cleanup7_crit_edge.us, %for.cond4.preheader.us.preheader + %indvars.iv95 = phi i64 [ %indvars.iv.next96, %for.cond4.for.cond.cleanup7_crit_edge.us ], [ 0, %for.cond4.preheader.us.preheader ] + %num_errors.091.us = phi i32 [ %num_errors.0.inc21.us, %for.cond4.for.cond.cleanup7_crit_edge.us ], [ 0, %for.cond4.preheader.us.preheader ] + %mul.us = mul i64 %indvars.iv95, %3 + br i1 %lcmp.mod, label %for.body8.us.prol.preheader, label %for.body8.us.prol.loopexit.unr-lcssa + +for.body8.us.prol.preheader: ; preds = %for.cond4.preheader.us + br label %for.body8.us.prol + +for.body8.us.prol: ; preds = %for.body8.us.prol.preheader + %arrayidx11.us.prol = getelementptr inbounds float, float* %5, i64 %mul.us + %41 = load float, float* %arrayidx11.us.prol, align 4, !tbaa !71 + %add15.us.prol = add i64 %mul.us, 1 + %arrayidx16.us.prol = getelementptr inbounds float, float* %5, i64 %add15.us.prol + %42 = load float, float* %arrayidx16.us.prol, align 4, !tbaa !71 + %cmp17.us.prol = fcmp fast olt float %41, %42 + %chosen.1.us.prol = zext i1 %cmp17.us.prol to i32 + br label %for.body8.us.prol.loopexit.unr-lcssa + +for.body8.us.prol.loopexit.unr-lcssa: ; preds = %for.body8.us.prol, %for.cond4.preheader.us + %chosen.1.us.lcssa.unr.ph = phi i32 [ %chosen.1.us.prol, %for.body8.us.prol ], [ undef, %for.cond4.preheader.us ] + %indvars.iv.unr.ph = phi i64 [ 2, %for.body8.us.prol ], [ 1, %for.cond4.preheader.us ] + %chosen.086.us.unr.ph = phi i32 [ %chosen.1.us.prol, %for.body8.us.prol ], [ 0, %for.cond4.preheader.us ] + br label %for.body8.us.prol.loopexit + +for.body8.us.prol.loopexit: ; preds = %for.body8.us.prol.loopexit.unr-lcssa + br i1 %7, label %for.cond4.for.cond.cleanup7_crit_edge.us, label %for.cond4.preheader.us.new + +for.cond4.preheader.us.new: ; preds = %for.body8.us.prol.loopexit + br label %for.body8.us + +for.body8.us: ; preds = %for.body8.us, %for.cond4.preheader.us.new + %indvars.iv = phi i64 [ %indvars.iv.unr.ph, %for.cond4.preheader.us.new ], [ %indvars.iv.next.1, %for.body8.us ] + %chosen.086.us = phi i32 [ %chosen.086.us.unr.ph, %for.cond4.preheader.us.new ], [ %chosen.1.us.1, %for.body8.us ] + %conv10.us = sext i32 %chosen.086.us to i64 + %add.us = add i64 %conv10.us, %mul.us + %arrayidx11.us = getelementptr inbounds float, float* %5, i64 %add.us + %43 = load float, float* %arrayidx11.us, align 4, !tbaa !71 + %add15.us = add i64 %indvars.iv, %mul.us + %arrayidx16.us = getelementptr inbounds float, float* %5, i64 %add15.us + %44 = load float, float* %arrayidx16.us, align 4, !tbaa !71 + %cmp17.us = fcmp fast olt float %43, %44 + %45 = trunc i64 %indvars.iv to i32 + %chosen.1.us = select i1 %cmp17.us, i32 %45, i32 %chosen.086.us + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %conv10.us.1 = sext i32 %chosen.1.us to i64 + %add.us.1 = add i64 %conv10.us.1, %mul.us + %arrayidx11.us.1 = getelementptr inbounds float, float* %5, i64 %add.us.1 + %46 = load float, float* %arrayidx11.us.1, align 4, !tbaa !71 + %add15.us.1 = add i64 %indvars.iv.next, %mul.us + %arrayidx16.us.1 = getelementptr inbounds float, float* %5, i64 %add15.us.1 + %47 = load float, float* %arrayidx16.us.1, align 4, !tbaa !71 + %cmp17.us.1 = fcmp fast olt float %46, %47 + %48 = trunc i64 %indvars.iv.next to i32 + %chosen.1.us.1 = select i1 %cmp17.us.1, i32 %48, i32 %chosen.1.us + %indvars.iv.next.1 = add nsw i64 %indvars.iv, 2 + %exitcond.1 = icmp eq i64 %indvars.iv.next.1, %3 + br i1 %exitcond.1, label %for.cond4.for.cond.cleanup7_crit_edge.us.unr-lcssa, label %for.body8.us + +for.cond4.for.cond.cleanup7_crit_edge.us.unr-lcssa: ; preds = %for.body8.us + br label %for.cond4.for.cond.cleanup7_crit_edge.us + +for.cond4.for.cond.cleanup7_crit_edge.us: ; preds = %for.cond4.for.cond.cleanup7_crit_edge.us.unr-lcssa, %for.body8.us.prol.loopexit + %chosen.1.us.lcssa = phi i32 [ %chosen.1.us.lcssa.unr.ph, %for.body8.us.prol.loopexit ], [ %chosen.1.us.1, %for.cond4.for.cond.cleanup7_crit_edge.us.unr-lcssa ] + %arrayidx18.us = getelementptr inbounds i32, i32* %labels, i64 %indvars.iv95 + %49 = load i32, i32* %arrayidx18.us, align 4, !tbaa !121 + %not.cmp19.us = icmp ne i32 %chosen.1.us.lcssa, %49 + %inc21.us = zext i1 %not.cmp19.us to i32 + %num_errors.0.inc21.us = add nsw i32 %inc21.us, %num_errors.091.us + %indvars.iv.next96 = add nuw nsw i64 %indvars.iv95, 1 + %exitcond97 = icmp eq i64 %indvars.iv.next96, %2 + br i1 %exitcond97, label %for.cond.cleanup.loopexit, label %for.cond4.preheader.us + +for.cond4.preheader: ; preds = %for.cond4.preheader, %for.cond4.preheader.preheader109 + %indvars.iv98 = phi i64 [ %indvars.iv.next99, %for.cond4.preheader ], [ %indvars.iv98.ph, %for.cond4.preheader.preheader109 ] + %num_errors.091 = phi i32 [ %num_errors.0.inc21, %for.cond4.preheader ], [ %num_errors.091.ph, %for.cond4.preheader.preheader109 ] + %arrayidx18 = getelementptr inbounds i32, i32* %labels, i64 %indvars.iv98 + %50 = load i32, i32* %arrayidx18, align 4, !tbaa !121 + %not.cmp19 = icmp ne i32 %50, 0 + %inc21 = zext i1 %not.cmp19 to i32 + %num_errors.0.inc21 = add nsw i32 %inc21, %num_errors.091 + %indvars.iv.next99 = add nuw nsw i64 %indvars.iv98, 1 + %exitcond100 = icmp eq i64 %indvars.iv.next99, %2 + br i1 %exitcond100, label %for.cond.cleanup.loopexit110, label %for.cond4.preheader, !llvm.loop !123 + +for.cond.cleanup.loopexit: ; preds = %for.cond4.for.cond.cleanup7_crit_edge.us + br label %for.cond.cleanup + +for.cond.cleanup.loopexit110: ; preds = %for.cond4.preheader + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit110, %for.cond.cleanup.loopexit, %middle.block, %entry + %num_errors.0.lcssa = phi i32 [ 0, %entry ], [ %40, %middle.block ], [ %num_errors.0.inc21.us, %for.cond.cleanup.loopexit ], [ %num_errors.0.inc21, %for.cond.cleanup.loopexit110 ] + %conv26 = sext i32 %num_errors.0.lcssa to i64 + %sub = sub i64 %2, %conv26 %conv27 = uitofp i64 %sub to double %conv29 = uitofp i64 %2 to double %div = fdiv fast double %conv27, %conv29 %mul31 = fmul fast double %div, 1.000000e+02 %conv32 = fptrunc double %mul31 to float %conv33 = fpext float %conv32 to double - %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.20, i64 0, i64 0), double %conv33) - %call34 = tail call %struct._IO_FILE* @fopen(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.21, i64 0, i64 0), i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.22, i64 0, i64 0)) - %cmp35 = icmp eq %struct._IO_FILE* %call34, null - br i1 %cmp35, label %if.end43, label %if.then36 + %call34 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.21, i64 0, i64 0), double %conv33) + %call35 = tail call %struct._IO_FILE* @fopen(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.22, i64 0, i64 0), i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.23, i64 0, i64 0)) + %cmp36 = icmp eq %struct._IO_FILE* %call35, null + br i1 %cmp36, label %if.end43, label %if.then37 -if.then36: ; preds = %for.cond.cleanup - %26 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i8* - call void @llvm.lifetime.start(i64 376, i8* nonnull %26) #7 - %27 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2 - %28 = getelementptr inbounds %"class.std::basic_ios", %"class.std::basic_ios"* %27, i64 0, i32 0 - call void @_ZNSt8ios_baseC2Ev(%"class.std::ios_base"* %28) #7 - %29 = getelementptr inbounds %"class.std::basic_ios", %"class.std::basic_ios"* %27, i64 0, i32 0, i32 0 - store i32 (...)** bitcast (i8** getelementptr inbounds ({ [4 x i8*] }, { [4 x i8*] }* @_ZTVSt9basic_iosIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %29, align 16, !tbaa !91 +if.then37: ; preds = %for.cond.cleanup + %51 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i8* + call void @llvm.lifetime.start(i64 376, i8* nonnull %51) #2 + %52 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2 + %53 = getelementptr inbounds %"class.std::basic_ios", %"class.std::basic_ios"* %52, i64 0, i32 0 + call void @_ZNSt8ios_baseC2Ev(%"class.std::ios_base"* %53) #2 + %54 = getelementptr inbounds %"class.std::basic_ios", %"class.std::basic_ios"* %52, i64 0, i32 0, i32 0 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [4 x i8*] }, { [4 x i8*] }* @_ZTVSt9basic_iosIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %54, align 16, !tbaa !97 %_M_tie.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 1 - store %"class.std::basic_ostream"* null, %"class.std::basic_ostream"** %_M_tie.i.i, align 8, !tbaa !93 + store %"class.std::basic_ostream"* null, %"class.std::basic_ostream"** %_M_tie.i.i, align 8, !tbaa !99 %_M_fill.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 2 - store i8 0, i8* %_M_fill.i.i, align 16, !tbaa !96 + store i8 0, i8* %_M_fill.i.i, align 16, !tbaa !102 %_M_fill_init.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 3 - store i8 0, i8* %_M_fill_init.i.i, align 1, !tbaa !97 + store i8 0, i8* %_M_fill_init.i.i, align 1, !tbaa !103 %_M_streambuf.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 4 - %30 = bitcast %"class.std::basic_streambuf"** %_M_streambuf.i.i to i8* - call void @llvm.memset.p0i8.i64(i8* %30, i8 0, i64 32, i32 8, i1 false) #7 - %31 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 1) to i64*), align 8 - %32 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i64* - store i64 %31, i64* %32, align 16, !tbaa !91 - %33 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 2) to i64*), align 8 - %34 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i8** - %vtable.cast.i.i = inttoptr i64 %31 to i8* + %55 = bitcast %"class.std::basic_streambuf"** %_M_streambuf.i.i to i8* + call void @llvm.memset.p0i8.i64(i8* %55, i8 0, i64 32, i32 8, i1 false) #2 + %56 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 1) to i64*), align 8 + %57 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i64* + store i64 %56, i64* %57, align 16, !tbaa !97 + %58 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 2) to i64*), align 8 + %59 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i8** + %vtable.cast.i.i = inttoptr i64 %56 to i8* %vbase.offset.ptr.i.i = getelementptr i8, i8* %vtable.cast.i.i, i64 -24 - %35 = bitcast i8* %vbase.offset.ptr.i.i to i64* - %vbase.offset.i.i = load i64, i64* %35, align 8 - %add.ptr.i.i = getelementptr inbounds i8, i8* %26, i64 %vbase.offset.i.i - %36 = bitcast i8* %add.ptr.i.i to i64* - store i64 %33, i64* %36, align 8, !tbaa !91 - %vtable3.i.i = load i8*, i8** %34, align 16, !tbaa !91 + %60 = bitcast i8* %vbase.offset.ptr.i.i to i64* + %vbase.offset.i.i = load i64, i64* %60, align 8 + %add.ptr.i.i = getelementptr inbounds i8, i8* %51, i64 %vbase.offset.i.i + %61 = bitcast i8* %add.ptr.i.i to i64* + store i64 %58, i64* %61, align 8, !tbaa !97 + %vtable3.i.i = load i8*, i8** %59, align 16, !tbaa !97 %vbase.offset.ptr4.i.i = getelementptr i8, i8* %vtable3.i.i, i64 -24 - %37 = bitcast i8* %vbase.offset.ptr4.i.i to i64* - %vbase.offset5.i.i = load i64, i64* %37, align 8 - %add.ptr6.i.i = getelementptr inbounds i8, i8* %26, i64 %vbase.offset5.i.i - %38 = bitcast i8* %add.ptr6.i.i to %"class.std::basic_ios"* - call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %38, %"class.std::basic_streambuf"* null) #7 - store i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 1, i64 3) to i32 (...)**), i32 (...)*** %29, align 16, !tbaa !91 + %62 = bitcast i8* %vbase.offset.ptr4.i.i to i64* + %vbase.offset5.i.i = load i64, i64* %62, align 8 + %add.ptr6.i.i = getelementptr inbounds i8, i8* %51, i64 %vbase.offset5.i.i + %63 = bitcast i8* %add.ptr6.i.i to %"class.std::basic_ios"* + call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %63, %"class.std::basic_streambuf"* null) #2 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 1, i64 3) to i32 (...)**), i32 (...)*** %54, align 16, !tbaa !97 %_M_stringbuf.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1 - %39 = getelementptr inbounds %"class.std::__cxx11::basic_stringbuf", %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i, i64 0, i32 0, i32 0 - %40 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to <2 x i32 (...)**>* - store <2 x i32 (...)**> <i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 3) to i32 (...)**), i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**)>, <2 x i32 (...)**>* %40, align 16, !tbaa !91 + %64 = getelementptr inbounds %"class.std::__cxx11::basic_stringbuf", %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i, i64 0, i32 0, i32 0 + %65 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to <2 x i32 (...)**>* + store <2 x i32 (...)**> <i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 3) to i32 (...)**), i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**)>, <2 x i32 (...)**>* %65, align 16, !tbaa !97 %_M_in_beg.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 1 %_M_buf_locale.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 7 - %41 = bitcast i8** %_M_in_beg.i.i.i to i8* - call void @llvm.memset.p0i8.i64(i8* %41, i8 0, i64 48, i32 8, i1 false) #7 - call void @_ZNSt6localeC1Ev(%"class.std::locale"* %_M_buf_locale.i.i.i) #7 - store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %39, align 8, !tbaa !91 + %66 = bitcast i8** %_M_in_beg.i.i.i to i8* + call void @llvm.memset.p0i8.i64(i8* %66, i8 0, i64 48, i32 8, i1 false) #2 + call void @_ZNSt6localeC1Ev(%"class.std::locale"* %_M_buf_locale.i.i.i) #2 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %64, align 8, !tbaa !97 %_M_mode.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 1 - store i32 16, i32* %_M_mode.i.i, align 8, !tbaa !98 + store i32 16, i32* %_M_mode.i.i, align 8, !tbaa !104 %_M_string.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2 - %42 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 2 - %43 = bitcast %"class.std::__cxx11::basic_string"* %_M_string.i.i to %union.anon** - store %union.anon* %42, %union.anon** %43, align 8, !tbaa !103 + %67 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 2 + %68 = bitcast %"class.std::__cxx11::basic_string"* %_M_string.i.i to %union.anon** + store %union.anon* %67, %union.anon** %68, align 8, !tbaa !109 %_M_string_length.i.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 1 - store i64 0, i64* %_M_string_length.i.i.i.i.i, align 8, !tbaa !104 - %.cast.i.i.i = bitcast %union.anon* %42 to i8* - store i8 0, i8* %.cast.i.i.i, align 8, !tbaa !87 - %vtable.i = load i8*, i8** %34, align 16, !tbaa !91 + store i64 0, i64* %_M_string_length.i.i.i.i.i, align 8, !tbaa !110 + %.cast.i.i.i = bitcast %union.anon* %67 to i8* + store i8 0, i8* %.cast.i.i.i, align 8, !tbaa !93 + %vtable.i = load i8*, i8** %59, align 16, !tbaa !97 %vbase.offset.ptr.i = getelementptr i8, i8* %vtable.i, i64 -24 - %44 = bitcast i8* %vbase.offset.ptr.i to i64* - %vbase.offset.i = load i64, i64* %44, align 8 - %add.ptr2.i = getelementptr inbounds i8, i8* %26, i64 %vbase.offset.i - %45 = bitcast i8* %add.ptr2.i to %"class.std::basic_ios"* - %46 = getelementptr inbounds %"class.std::__cxx11::basic_stringbuf", %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i, i64 0, i32 0 - call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %45, %"class.std::basic_streambuf"* %46) #7 - %47 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to %"class.std::basic_ostream"* - %vtable.i72 = load i8*, i8** %34, align 16, !tbaa !91 - %vbase.offset.ptr.i73 = getelementptr i8, i8* %vtable.i72, i64 -24 - %48 = bitcast i8* %vbase.offset.ptr.i73 to i64* - %vbase.offset.i74 = load i64, i64* %48, align 8 - %add.ptr.i = getelementptr inbounds i8, i8* %26, i64 %vbase.offset.i74 + %69 = bitcast i8* %vbase.offset.ptr.i to i64* + %vbase.offset.i = load i64, i64* %69, align 8 + %add.ptr2.i = getelementptr inbounds i8, i8* %51, i64 %vbase.offset.i + %70 = bitcast i8* %add.ptr2.i to %"class.std::basic_ios"* + %71 = getelementptr inbounds %"class.std::__cxx11::basic_stringbuf", %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i, i64 0, i32 0 + call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %70, %"class.std::basic_streambuf"* %71) #2 + %72 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to %"class.std::basic_ostream"* + %vtable.i77 = load i8*, i8** %59, align 16, !tbaa !97 + %vbase.offset.ptr.i78 = getelementptr i8, i8* %vtable.i77, i64 -24 + %73 = bitcast i8* %vbase.offset.ptr.i78 to i64* + %vbase.offset.i79 = load i64, i64* %73, align 8 + %add.ptr.i = getelementptr inbounds i8, i8* %51, i64 %vbase.offset.i79 %_M_flags.i.i = getelementptr inbounds i8, i8* %add.ptr.i, i64 24 - %49 = bitcast i8* %_M_flags.i.i to i32* - %50 = load i32, i32* %49, align 8, !tbaa !108 - %and.i.i.i.i = and i32 %50, -261 + %74 = bitcast i8* %_M_flags.i.i to i32* + %75 = load i32, i32* %74, align 8, !tbaa !116 + %and.i.i.i.i = and i32 %75, -261 %or.i.i.i.i = or i32 %and.i.i.i.i, 4 - store i32 %or.i.i.i.i, i32* %49, align 4, !tbaa !105 - %call.i = call dereferenceable(272) %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"* nonnull %47, double %conv33) #7 - %51 = bitcast %"class.std::__cxx11::basic_string"* %print_str to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %51) #7 - call void @_ZNKSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEE3strEv(%"class.std::__cxx11::basic_string"* nonnull sret %print_str, %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i) #7 + store i32 %or.i.i.i.i, i32* %74, align 4, !tbaa !111 + %call.i = call dereferenceable(272) %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"* nonnull %72, double %conv33) #2 + %76 = bitcast %"class.std::__cxx11::basic_string"* %print_str to i8* + call void @llvm.lifetime.start(i64 32, i8* nonnull %76) #2 + call void @_ZNKSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEE3strEv(%"class.std::__cxx11::basic_string"* nonnull sret %print_str, %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i) #2 %_M_p.i.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 0, i32 0 - %52 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !107 + %77 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !113 %_M_string_length.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 1 - %53 = load i64, i64* %_M_string_length.i, align 8, !tbaa !104 - %call41 = call i64 @fwrite(i8* %52, i64 1, i64 %53, %struct._IO_FILE* nonnull %call34) - %call42 = call i32 @fclose(%struct._IO_FILE* nonnull %call34) - %54 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !107 - %55 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 2 - %arraydecay.i.i.i.i = bitcast %union.anon* %55 to i8* - %cmp.i.i.i = icmp eq i8* %54, %arraydecay.i.i.i.i + %78 = load i64, i64* %_M_string_length.i, align 8, !tbaa !110 + %call42 = call i64 @fwrite(i8* %77, i64 1, i64 %78, %struct._IO_FILE* nonnull %call35) + %79 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !113 + %80 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 2 + %arraydecay.i.i.i.i = bitcast %union.anon* %80 to i8* + %cmp.i.i.i = icmp eq i8* %79, %arraydecay.i.i.i.i br i1 %cmp.i.i.i, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit, label %if.then.i.i -if.then.i.i: ; preds = %if.then36 - call void @_ZdlPv(i8* %54) #7 +if.then.i.i: ; preds = %if.then37 + call void @_ZdlPv(i8* %79) #2 br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit: ; preds = %if.then.i.i, %if.then36 - call void @llvm.lifetime.end(i64 32, i8* nonnull %51) #7 - %56 = load i64, i64* bitcast ([4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE to i64*), align 8 - store i64 %56, i64* %32, align 16, !tbaa !91 - %57 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 3) to i64*), align 8 - %vtable.cast.i.i76 = inttoptr i64 %56 to i8* - %vbase.offset.ptr.i.i77 = getelementptr i8, i8* %vtable.cast.i.i76, i64 -24 - %58 = bitcast i8* %vbase.offset.ptr.i.i77 to i64* - %vbase.offset.i.i78 = load i64, i64* %58, align 8 - %add.ptr.i.i79 = getelementptr inbounds i8, i8* %26, i64 %vbase.offset.i.i78 - %59 = bitcast i8* %add.ptr.i.i79 to i64* - store i64 %57, i64* %59, align 8, !tbaa !91 - %60 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 0 - store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %60, align 8, !tbaa !91 +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit: ; preds = %if.then.i.i, %if.then37 + call void @llvm.lifetime.end(i64 32, i8* nonnull %76) #2 + %81 = load i64, i64* bitcast ([4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE to i64*), align 8 + store i64 %81, i64* %57, align 16, !tbaa !97 + %82 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 3) to i64*), align 8 + %vtable.cast.i.i81 = inttoptr i64 %81 to i8* + %vbase.offset.ptr.i.i82 = getelementptr i8, i8* %vtable.cast.i.i81, i64 -24 + %83 = bitcast i8* %vbase.offset.ptr.i.i82 to i64* + %vbase.offset.i.i83 = load i64, i64* %83, align 8 + %add.ptr.i.i84 = getelementptr inbounds i8, i8* %51, i64 %vbase.offset.i.i83 + %84 = bitcast i8* %add.ptr.i.i84 to i64* + store i64 %82, i64* %84, align 8, !tbaa !97 + %85 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 0 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %85, align 8, !tbaa !97 %_M_p.i.i.i.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 0, i32 0 - %61 = load i8*, i8** %_M_p.i.i.i.i.i.i.i, align 8, !tbaa !107 - %cmp.i.i.i.i.i.i = icmp eq i8* %61, %.cast.i.i.i + %86 = load i8*, i8** %_M_p.i.i.i.i.i.i.i, align 8, !tbaa !113 + %cmp.i.i.i.i.i.i = icmp eq i8* %86, %.cast.i.i.i br i1 %cmp.i.i.i.i.i.i, label %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit, label %if.then.i.i.i.i.i if.then.i.i.i.i.i: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit - call void @_ZdlPv(i8* %61) #7 + call void @_ZdlPv(i8* %86) #2 br label %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit _ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit: ; preds = %if.then.i.i.i.i.i, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit - store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %60, align 8, !tbaa !91 - call void @_ZNSt6localeD1Ev(%"class.std::locale"* nonnull %_M_buf_locale.i.i.i) #7 - %62 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 0 - call void @_ZNSt8ios_baseD2Ev(%"class.std::ios_base"* %62) #7 - call void @llvm.lifetime.end(i64 376, i8* nonnull %26) #7 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %85, align 8, !tbaa !97 + call void @_ZNSt6localeD1Ev(%"class.std::locale"* nonnull %_M_buf_locale.i.i.i) #2 + %87 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 0 + call void @_ZNSt8ios_baseD2Ev(%"class.std::ios_base"* %87) #2 + call void @llvm.lifetime.end(i64 376, i8* nonnull %51) #2 br label %if.end43 if.end43: ; preds = %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit, %for.cond.cleanup + %call44 = call i32 @fclose(%struct._IO_FILE* %call35) + ret float %conv32 +} + +; Function Attrs: norecurse nounwind readnone uwtable +define zeroext i1 @_Z16descendFloatComp9ClassProbS_(i64 %obj1.coerce, i64 %obj2.coerce) #7 { +entry: + %obj1.sroa.0.0.extract.trunc = trunc i64 %obj1.coerce to i32 + %0 = bitcast i32 %obj1.sroa.0.0.extract.trunc to float + %obj2.sroa.0.0.extract.trunc = trunc i64 %obj2.coerce to i32 + %1 = bitcast i32 %obj2.sroa.0.0.extract.trunc to float + %cmp = fcmp fast ogt float %0, %1 + ret i1 %cmp +} + +; Function Attrs: nounwind uwtable +define float @_Z19computeTop5AccuracyPhiPvj(i8* nocapture readonly %labels, i32 %num_labels, i8* nocapture readonly %result_ptr, i32 %num_classes) local_unnamed_addr #3 { +entry: + %elem_probs.sroa.9 = alloca i64, align 8 + %elem_probs.sroa.15 = alloca %struct.ClassProb*, align 8 + %ss = alloca %"class.std::__cxx11::basic_ostringstream", align 16 + %print_str = alloca %"class.std::__cxx11::basic_string", align 8 + %dim_sizes = getelementptr inbounds i8, i8* %result_ptr, i64 96 + %0 = bitcast i8* %dim_sizes to i64** + %1 = load i64*, i64** %0, align 8, !tbaa !65 + %2 = load i64, i64* %1, align 8, !tbaa !66 + %arrayidx3 = getelementptr inbounds i64, i64* %1, i64 1 + %3 = load i64, i64* %arrayidx3, align 8, !tbaa !66 + %host_data = getelementptr inbounds i8, i8* %result_ptr, i64 48 + %4 = bitcast i8* %host_data to float** + %5 = load float*, float** %4, align 8, !tbaa !68 + %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([34 x i8], [34 x i8]* @.str.24, i64 0, i64 0), i64 %2, i64 %3) + %cmp162 = icmp sgt i32 %num_labels, 0 + br i1 %cmp162, label %for.body.lr.ph, label %for.cond.cleanup + +for.body.lr.ph: ; preds = %entry + %elem_probs.sroa.9.0..sroa_cast151 = bitcast i64* %elem_probs.sroa.9 to i8* + %elem_probs.sroa.15.0..sroa_cast149 = bitcast %struct.ClassProb** %elem_probs.sroa.15 to i8* + %cmp5156 = icmp eq i32 %num_classes, 0 + %elem_probs.sroa.9.0._M_finish.i110.sroa_cast = bitcast i64* %elem_probs.sroa.9 to %struct.ClassProb** + %6 = zext i32 %num_classes to i64 + %7 = sext i32 %num_labels to i64 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %_ZNSt6vectorI9ClassProbSaIS0_EED2Ev.exit + %phitmp = sext i32 %add31.num_errors.0 to i64 + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + %num_errors.0.lcssa = phi i64 [ 0, %entry ], [ %phitmp, %for.cond.cleanup.loopexit ] + %sub = sub i64 %2, %num_errors.0.lcssa + %conv37 = uitofp i64 %sub to double + %conv39 = uitofp i64 %2 to double + %div = fdiv fast double %conv37, %conv39 + %mul41 = fmul fast double %div, 1.000000e+02 + %conv42 = fptrunc double %mul41 to float + %conv43 = fpext float %conv42 to double + %call44 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str.21, i64 0, i64 0), double %conv43) + %call45 = tail call %struct._IO_FILE* @fopen(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.22, i64 0, i64 0), i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.23, i64 0, i64 0)) + %cmp46 = icmp eq %struct._IO_FILE* %call45, null + br i1 %cmp46, label %if.end53, label %if.then47 + +for.body: ; preds = %_ZNSt6vectorI9ClassProbSaIS0_EED2Ev.exit, %for.body.lr.ph + %indvars.iv169 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next170, %_ZNSt6vectorI9ClassProbSaIS0_EED2Ev.exit ] + %num_errors.0164 = phi i32 [ 0, %for.body.lr.ph ], [ %add31.num_errors.0, %_ZNSt6vectorI9ClassProbSaIS0_EED2Ev.exit ] + call void @llvm.lifetime.start(i64 8, i8* nonnull %elem_probs.sroa.9.0..sroa_cast151) + call void @llvm.lifetime.start(i64 8, i8* nonnull %elem_probs.sroa.15.0..sroa_cast149) + store i64 0, i64* %elem_probs.sroa.9, align 8 + store %struct.ClassProb* null, %struct.ClassProb** %elem_probs.sroa.15, align 8 + br i1 %cmp5156, label %for.cond.cleanup6, label %for.body7.lr.ph + +for.body7.lr.ph: ; preds = %for.body + %mul = mul i64 %indvars.iv169, %3 + br label %for.body7 + +for.cond.cleanup6.loopexit: ; preds = %_ZNSt6vectorI9ClassProbSaIS0_EE9push_backERKS0_.exit + br label %for.cond.cleanup6 + +for.cond.cleanup6: ; preds = %for.cond.cleanup6.loopexit, %for.body + %elem_probs.sroa.9.0.elem_probs.sroa.9.8.146 = phi i64 [ 0, %for.body ], [ %elem_probs.sroa.9.0.elem_probs.sroa.9.8..pre, %for.cond.cleanup6.loopexit ] + %elem_probs.sroa.0.0.lcssa = phi i64 [ 0, %for.body ], [ %elem_probs.sroa.0.1, %for.cond.cleanup6.loopexit ] + %8 = inttoptr i64 %elem_probs.sroa.0.0.lcssa to %struct.ClassProb* + %9 = inttoptr i64 %elem_probs.sroa.9.0.elem_probs.sroa.9.8.146 to %struct.ClassProb* + %cmp.i.i.i = icmp eq %struct.ClassProb* %8, %9 + br i1 %cmp.i.i.i, label %for.cond16.preheader, label %if.then.i.i + +for.cond16.preheader.loopexit: ; preds = %_ZSt25__unguarded_linear_insertIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops14_Val_comp_iterIPFbS2_S2_EEEEvT_T0_.exit.i.i + br label %for.cond16.preheader + +for.cond16.preheader.loopexit179: ; preds = %for.inc.i.i + br label %for.cond16.preheader + +for.cond16.preheader: ; preds = %for.cond.preheader.i.i, %_ZSt16__insertion_sortIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_T0_.exit61.i, %for.cond16.preheader.loopexit179, %for.cond16.preheader.loopexit, %for.cond.cleanup6 + %arrayidx24 = getelementptr inbounds i8, i8* %labels, i64 %indvars.iv169 + %10 = load i8, i8* %arrayidx24, align 1, !tbaa !93 + %conv25 = zext i8 %10 to i32 + %cProb20.sroa.3.0..sroa_idx62 = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %8, i64 0, i32 1 + %cProb20.sroa.3.0.copyload = load i32, i32* %cProb20.sroa.3.0..sroa_idx62, align 4 + %cmp26 = icmp eq i32 %cProb20.sroa.3.0.copyload, %conv25 + %cProb20.sroa.3.0..sroa_idx62.1 = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %8, i64 1, i32 1 + %cProb20.sroa.3.0.copyload.1 = load i32, i32* %cProb20.sroa.3.0..sroa_idx62.1, align 4 + %cmp26.1 = icmp eq i32 %cProb20.sroa.3.0.copyload.1, %conv25 + %narrow = or i1 %cmp26.1, %cmp26 + %cProb20.sroa.3.0..sroa_idx62.2 = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %8, i64 2, i32 1 + %cProb20.sroa.3.0.copyload.2 = load i32, i32* %cProb20.sroa.3.0..sroa_idx62.2, align 4 + %cmp26.2 = icmp eq i32 %cProb20.sroa.3.0.copyload.2, %conv25 + %narrow174 = or i1 %cmp26.2, %narrow + %cProb20.sroa.3.0..sroa_idx62.3 = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %8, i64 3, i32 1 + %cProb20.sroa.3.0.copyload.3 = load i32, i32* %cProb20.sroa.3.0..sroa_idx62.3, align 4 + %cmp26.3 = icmp eq i32 %cProb20.sroa.3.0.copyload.3, %conv25 + %narrow175 = or i1 %cmp26.3, %narrow174 + %cProb20.sroa.3.0..sroa_idx62.4 = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %8, i64 4, i32 1 + %cProb20.sroa.3.0.copyload.4 = load i32, i32* %cProb20.sroa.3.0..sroa_idx62.4, align 4 + %cmp26.4 = icmp eq i32 %cProb20.sroa.3.0.copyload.4, %conv25 + %narrow176 = or i1 %cmp26.4, %narrow175 + %11 = xor i1 %narrow176, true + %12 = zext i1 %11 to i32 + %add31.num_errors.0 = add nsw i32 %12, %num_errors.0164 + %tobool.i.i.i117 = icmp eq i64 %elem_probs.sroa.0.0.lcssa, 0 + br i1 %tobool.i.i.i117, label %_ZNSt6vectorI9ClassProbSaIS0_EED2Ev.exit, label %if.then.i.i.i + +if.then.i.i: ; preds = %for.cond.cleanup6 + %sub.ptr.sub.i.i.i = sub i64 %elem_probs.sroa.9.0.elem_probs.sroa.9.8.146, %elem_probs.sroa.0.0.lcssa + %sub.ptr.div.i.i.i = ashr exact i64 %sub.ptr.sub.i.i.i, 3 + %13 = tail call i64 @llvm.ctlz.i64(i64 %sub.ptr.div.i.i.i, i1 true) #2 + %sub.i.i.i = shl nuw nsw i64 %13, 1 + %mul.i.i = xor i64 %sub.i.i.i, 126 + tail call void @_ZSt16__introsort_loopIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElNS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_T0_T1_(%struct.ClassProb* %8, %struct.ClassProb* %9, i64 %mul.i.i, i1 (i64, i64)* nonnull @_Z16descendFloatComp9ClassProbS_) #2 + %cmp.i = icmp sgt i64 %sub.ptr.sub.i.i.i, 128 + br i1 %cmp.i, label %for.body.lr.ph.i30.i, label %for.cond.preheader.i.i + +for.body.lr.ph.i30.i: ; preds = %if.then.i.i + %agg.tmp3.sroa.0.0..sroa_cast.i.i28.i = inttoptr i64 %elem_probs.sroa.0.0.lcssa to i64* + %14 = inttoptr i64 %elem_probs.sroa.0.0.lcssa to i8* + br label %for.body.i37.i + +for.body.i37.i: ; preds = %for.inc.i60.i, %for.body.lr.ph.i30.i + %incdec.ptr.i54.i31.idx.i = phi i64 [ 1, %for.body.lr.ph.i30.i ], [ %incdec.ptr.i54.i31.add.i, %for.inc.i60.i ] + %__i.sroa.0.0.sink53.i32.i = phi %struct.ClassProb* [ %8, %for.body.lr.ph.i30.i ], [ %incdec.ptr.i54.i31.ptr.i, %for.inc.i60.i ] + %incdec.ptr.i54.i31.ptr.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %8, i64 %incdec.ptr.i54.i31.idx.i + %agg.tmp.sroa.0.0..sroa_cast.i.i33.i = bitcast %struct.ClassProb* %incdec.ptr.i54.i31.ptr.i to i64* + %agg.tmp.sroa.0.0.copyload.i.i34.i = load i64, i64* %agg.tmp.sroa.0.0..sroa_cast.i.i33.i, align 4 + %agg.tmp3.sroa.0.0.copyload.i.i35.i = load i64, i64* %agg.tmp3.sroa.0.0..sroa_cast.i.i28.i, align 4 + %obj1.sroa.0.0.extract.trunc.i101 = trunc i64 %agg.tmp.sroa.0.0.copyload.i.i34.i to i32 + %15 = bitcast i32 %obj1.sroa.0.0.extract.trunc.i101 to float + %obj2.sroa.0.0.extract.trunc.i102 = trunc i64 %agg.tmp3.sroa.0.0.copyload.i.i35.i to i32 + %16 = bitcast i32 %obj2.sroa.0.0.extract.trunc.i102 to float + %cmp.i103 = fcmp fast ogt float %15, %16 + br i1 %cmp.i103, label %if.then10.i42.i, label %if.else.i50.i + +if.then10.i42.i: ; preds = %for.body.i37.i + %incdec.ptr.i54.i31.ptr.idx.i = shl nuw i64 %incdec.ptr.i54.i31.idx.i, 3 + %sub.ptr.div.i.i.i.i.i40.i = ashr exact i64 %incdec.ptr.i54.i31.ptr.idx.i, 3 + %add.ptr.i41.i43.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__i.sroa.0.0.sink53.i32.i, i64 2 + %.pre.i.i.i.i.i44.i = sub nsw i64 0, %sub.ptr.div.i.i.i.i.i40.i + %.pre9.i.i.i.i.i45.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %add.ptr.i41.i43.i, i64 %.pre.i.i.i.i.i44.i + %17 = bitcast %struct.ClassProb* %.pre9.i.i.i.i.i45.i to i8* + tail call void @llvm.memmove.p0i8.p0i8.i64(i8* %17, i8* nonnull %14, i64 %incdec.ptr.i54.i31.ptr.idx.i, i32 4, i1 false) #2 + br label %for.inc.i60.i + +if.else.i50.i: ; preds = %for.body.i37.i + %agg.tmp2.sroa.0.0..sroa_cast.i26.i.i47.i = bitcast %struct.ClassProb* %__i.sroa.0.0.sink53.i32.i to i64* + %agg.tmp2.sroa.0.0.copyload.i27.i.i48.i = load i64, i64* %agg.tmp2.sroa.0.0..sroa_cast.i26.i.i47.i, align 4 + %obj2.sroa.0.0.extract.trunc.i90 = trunc i64 %agg.tmp2.sroa.0.0.copyload.i27.i.i48.i to i32 + %18 = bitcast i32 %obj2.sroa.0.0.extract.trunc.i90 to float + %cmp.i91 = fcmp fast ogt float %15, %18 + br i1 %cmp.i91, label %while.body.i.i56.i.preheader, label %for.inc.i60.i + +while.body.i.i56.i.preheader: ; preds = %if.else.i50.i + br label %while.body.i.i56.i + +while.body.i.i56.i: ; preds = %while.body.i.i56.i, %while.body.i.i56.i.preheader + %19 = phi i64 [ %agg.tmp2.sroa.0.0.copyload.i.i.i54.i, %while.body.i.i56.i ], [ %agg.tmp2.sroa.0.0.copyload.i27.i.i48.i, %while.body.i.i56.i.preheader ] + %20 = phi i64* [ %indvars55.i51.i, %while.body.i.i56.i ], [ %agg.tmp.sroa.0.0..sroa_cast.i.i33.i, %while.body.i.i56.i.preheader ] + %21 = phi %struct.ClassProb* [ %incdec.ptr.i.i.i52.i, %while.body.i.i56.i ], [ %__i.sroa.0.0.sink53.i32.i, %while.body.i.i56.i.preheader ] + %indvars55.i51.i = bitcast %struct.ClassProb* %21 to i64* + store i64 %19, i64* %20, align 4 + %incdec.ptr.i.i.i52.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %21, i64 -1 + %indvars.i53.i = bitcast %struct.ClassProb* %incdec.ptr.i.i.i52.i to i64* + %agg.tmp2.sroa.0.0.copyload.i.i.i54.i = load i64, i64* %indvars.i53.i, align 4 + %obj2.sroa.0.0.extract.trunc.i99 = trunc i64 %agg.tmp2.sroa.0.0.copyload.i.i.i54.i to i32 + %22 = bitcast i32 %obj2.sroa.0.0.extract.trunc.i99 to float + %cmp.i100 = fcmp fast ogt float %15, %22 + br i1 %cmp.i100, label %while.body.i.i56.i, label %for.inc.i60.i.loopexit + +for.inc.i60.i.loopexit: ; preds = %while.body.i.i56.i + br label %for.inc.i60.i + +for.inc.i60.i: ; preds = %for.inc.i60.i.loopexit, %if.else.i50.i, %if.then10.i42.i + %agg.tmp3.sroa.0.0..sroa_cast.i.sink.i57.i = phi i64* [ %agg.tmp.sroa.0.0..sroa_cast.i.i33.i, %if.else.i50.i ], [ %agg.tmp3.sroa.0.0..sroa_cast.i.i28.i, %if.then10.i42.i ], [ %indvars55.i51.i, %for.inc.i60.i.loopexit ] + store i64 %agg.tmp.sroa.0.0.copyload.i.i34.i, i64* %agg.tmp3.sroa.0.0..sroa_cast.i.sink.i57.i, align 4 + %incdec.ptr.i54.i31.add.i = add nuw nsw i64 %incdec.ptr.i54.i31.idx.i, 1 + %cmp.i38.i59.i = icmp eq i64 %incdec.ptr.i54.i31.add.i, 16 + br i1 %cmp.i38.i59.i, label %_ZSt16__insertion_sortIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_T0_.exit61.i, label %for.body.i37.i + +_ZSt16__insertion_sortIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_T0_.exit61.i: ; preds = %for.inc.i60.i + %add.ptr.i.i86 = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %8, i64 16 + %cmp.i13.i.i = icmp eq %struct.ClassProb* %add.ptr.i.i86, %9 + br i1 %cmp.i13.i.i, label %for.cond16.preheader, label %for.body.i67.i.preheader + +for.body.i67.i.preheader: ; preds = %_ZSt16__insertion_sortIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_T0_.exit61.i + br label %for.body.i67.i + +for.body.i67.i: ; preds = %_ZSt25__unguarded_linear_insertIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops14_Val_comp_iterIPFbS2_S2_EEEEvT_T0_.exit.i.i, %for.body.i67.i.preheader + %__i.sroa.0.014.i.i = phi %struct.ClassProb* [ %incdec.ptr.i.i73.i, %_ZSt25__unguarded_linear_insertIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops14_Val_comp_iterIPFbS2_S2_EEEEvT_T0_.exit.i.i ], [ %add.ptr.i.i86, %for.body.i67.i.preheader ] + %23 = bitcast %struct.ClassProb* %__i.sroa.0.014.i.i to i64* + %24 = load i64, i64* %23, align 4 + %incdec.ptr.i25.i.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__i.sroa.0.014.i.i, i64 -1 + %agg.tmp2.sroa.0.0..sroa_cast.i26.i.i64.i = bitcast %struct.ClassProb* %incdec.ptr.i25.i.i.i to i64* + %agg.tmp2.sroa.0.0.copyload.i27.i.i65.i = load i64, i64* %agg.tmp2.sroa.0.0..sroa_cast.i26.i.i64.i, align 4 + %obj1.sroa.0.0.extract.trunc.i = trunc i64 %24 to i32 + %25 = bitcast i32 %obj1.sroa.0.0.extract.trunc.i to float + %obj2.sroa.0.0.extract.trunc.i = trunc i64 %agg.tmp2.sroa.0.0.copyload.i27.i.i65.i to i32 + %26 = bitcast i32 %obj2.sroa.0.0.extract.trunc.i to float + %cmp.i88 = fcmp fast ogt float %25, %26 + br i1 %cmp.i88, label %while.body.i.i72.i.preheader, label %_ZSt25__unguarded_linear_insertIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops14_Val_comp_iterIPFbS2_S2_EEEEvT_T0_.exit.i.i + +while.body.i.i72.i.preheader: ; preds = %for.body.i67.i + br label %while.body.i.i72.i + +while.body.i.i72.i: ; preds = %while.body.i.i72.i, %while.body.i.i72.i.preheader + %27 = phi i64 [ %agg.tmp2.sroa.0.0.copyload.i.i.i70.i, %while.body.i.i72.i ], [ %agg.tmp2.sroa.0.0.copyload.i27.i.i65.i, %while.body.i.i72.i.preheader ] + %28 = phi i64* [ %indvars15.i.i, %while.body.i.i72.i ], [ %23, %while.body.i.i72.i.preheader ] + %29 = phi %struct.ClassProb* [ %incdec.ptr.i.i.i68.i, %while.body.i.i72.i ], [ %incdec.ptr.i25.i.i.i, %while.body.i.i72.i.preheader ] + %indvars15.i.i = bitcast %struct.ClassProb* %29 to i64* + store i64 %27, i64* %28, align 4 + %incdec.ptr.i.i.i68.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %29, i64 -1 + %indvars.i69.i = bitcast %struct.ClassProb* %incdec.ptr.i.i.i68.i to i64* + %agg.tmp2.sroa.0.0.copyload.i.i.i70.i = load i64, i64* %indvars.i69.i, align 4 + %obj2.sroa.0.0.extract.trunc.i96 = trunc i64 %agg.tmp2.sroa.0.0.copyload.i.i.i70.i to i32 + %30 = bitcast i32 %obj2.sroa.0.0.extract.trunc.i96 to float + %cmp.i97 = fcmp fast ogt float %25, %30 + br i1 %cmp.i97, label %while.body.i.i72.i, label %_ZSt25__unguarded_linear_insertIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops14_Val_comp_iterIPFbS2_S2_EEEEvT_T0_.exit.i.i.loopexit + +_ZSt25__unguarded_linear_insertIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops14_Val_comp_iterIPFbS2_S2_EEEEvT_T0_.exit.i.i.loopexit: ; preds = %while.body.i.i72.i + br label %_ZSt25__unguarded_linear_insertIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops14_Val_comp_iterIPFbS2_S2_EEEEvT_T0_.exit.i.i + +_ZSt25__unguarded_linear_insertIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops14_Val_comp_iterIPFbS2_S2_EEEEvT_T0_.exit.i.i: ; preds = %_ZSt25__unguarded_linear_insertIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops14_Val_comp_iterIPFbS2_S2_EEEEvT_T0_.exit.i.i.loopexit, %for.body.i67.i + %.lcssa.i.i.i = phi i64* [ %23, %for.body.i67.i ], [ %indvars15.i.i, %_ZSt25__unguarded_linear_insertIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops14_Val_comp_iterIPFbS2_S2_EEEEvT_T0_.exit.i.i.loopexit ] + store i64 %24, i64* %.lcssa.i.i.i, align 4 + %incdec.ptr.i.i73.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__i.sroa.0.014.i.i, i64 1 + %cmp.i.i74.i = icmp eq %struct.ClassProb* %incdec.ptr.i.i73.i, %9 + br i1 %cmp.i.i74.i, label %for.cond16.preheader.loopexit, label %for.body.i67.i + +for.cond.preheader.i.i: ; preds = %if.then.i.i + %incdec.ptr.i51.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %8, i64 1 + %cmp.i3852.i.i = icmp eq %struct.ClassProb* %incdec.ptr.i51.i.i, %9 + br i1 %cmp.i3852.i.i, label %for.cond16.preheader, label %for.body.lr.ph.i.i + +for.body.lr.ph.i.i: ; preds = %for.cond.preheader.i.i + %agg.tmp3.sroa.0.0..sroa_cast.i.i.i = inttoptr i64 %elem_probs.sroa.0.0.lcssa to i64* + %31 = inttoptr i64 %elem_probs.sroa.0.0.lcssa to i8* + br label %for.body.i.i + +for.body.i.i: ; preds = %for.inc.i.i, %for.body.lr.ph.i.i + %incdec.ptr.i54.i.i = phi %struct.ClassProb* [ %incdec.ptr.i51.i.i, %for.body.lr.ph.i.i ], [ %incdec.ptr.i.i.i, %for.inc.i.i ] + %__i.sroa.0.0.sink53.i.i = phi %struct.ClassProb* [ %8, %for.body.lr.ph.i.i ], [ %incdec.ptr.i54.i.i, %for.inc.i.i ] + %agg.tmp.sroa.0.0..sroa_cast.i.i.i = bitcast %struct.ClassProb* %incdec.ptr.i54.i.i to i64* + %agg.tmp.sroa.0.0.copyload.i.i.i = load i64, i64* %agg.tmp.sroa.0.0..sroa_cast.i.i.i, align 4 + %agg.tmp3.sroa.0.0.copyload.i.i.i = load i64, i64* %agg.tmp3.sroa.0.0..sroa_cast.i.i.i, align 4 + %obj1.sroa.0.0.extract.trunc.i107 = trunc i64 %agg.tmp.sroa.0.0.copyload.i.i.i to i32 + %32 = bitcast i32 %obj1.sroa.0.0.extract.trunc.i107 to float + %obj2.sroa.0.0.extract.trunc.i108 = trunc i64 %agg.tmp3.sroa.0.0.copyload.i.i.i to i32 + %33 = bitcast i32 %obj2.sroa.0.0.extract.trunc.i108 to float + %cmp.i109 = fcmp fast ogt float %32, %33 + br i1 %cmp.i109, label %if.then10.i.i, label %if.else.i.i + +if.then10.i.i: ; preds = %for.body.i.i + %sub.ptr.lhs.cast.i.i.i.i.i.i = ptrtoint %struct.ClassProb* %incdec.ptr.i54.i.i to i64 + %sub.ptr.sub.i.i.i.i.i.i = sub i64 %sub.ptr.lhs.cast.i.i.i.i.i.i, %elem_probs.sroa.0.0.lcssa + %sub.ptr.div.i.i.i.i.i.i = ashr exact i64 %sub.ptr.sub.i.i.i.i.i.i, 3 + %tobool.i.i.i.i.i.i = icmp eq i64 %sub.ptr.div.i.i.i.i.i.i, 0 + br i1 %tobool.i.i.i.i.i.i, label %for.inc.i.i, label %if.then.i.i.i.i.i.i + +if.then.i.i.i.i.i.i: ; preds = %if.then10.i.i + %add.ptr.i41.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__i.sroa.0.0.sink53.i.i, i64 2 + %.pre.i.i.i.i.i.i = sub nsw i64 0, %sub.ptr.div.i.i.i.i.i.i + %.pre9.i.i.i.i.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %add.ptr.i41.i.i, i64 %.pre.i.i.i.i.i.i + %34 = bitcast %struct.ClassProb* %.pre9.i.i.i.i.i.i to i8* + tail call void @llvm.memmove.p0i8.p0i8.i64(i8* %34, i8* nonnull %31, i64 %sub.ptr.sub.i.i.i.i.i.i, i32 4, i1 false) #2 + br label %for.inc.i.i + +if.else.i.i: ; preds = %for.body.i.i + %agg.tmp2.sroa.0.0..sroa_cast.i26.i.i.i = bitcast %struct.ClassProb* %__i.sroa.0.0.sink53.i.i to i64* + %agg.tmp2.sroa.0.0.copyload.i27.i.i.i = load i64, i64* %agg.tmp2.sroa.0.0..sroa_cast.i26.i.i.i, align 4 + %obj2.sroa.0.0.extract.trunc.i93 = trunc i64 %agg.tmp2.sroa.0.0.copyload.i27.i.i.i to i32 + %35 = bitcast i32 %obj2.sroa.0.0.extract.trunc.i93 to float + %cmp.i94 = fcmp fast ogt float %32, %35 + br i1 %cmp.i94, label %while.body.i.i.i.preheader, label %for.inc.i.i + +while.body.i.i.i.preheader: ; preds = %if.else.i.i + br label %while.body.i.i.i + +while.body.i.i.i: ; preds = %while.body.i.i.i, %while.body.i.i.i.preheader + %36 = phi i64 [ %agg.tmp2.sroa.0.0.copyload.i.i.i.i, %while.body.i.i.i ], [ %agg.tmp2.sroa.0.0.copyload.i27.i.i.i, %while.body.i.i.i.preheader ] + %37 = phi i64* [ %indvars55.i.i, %while.body.i.i.i ], [ %agg.tmp.sroa.0.0..sroa_cast.i.i.i, %while.body.i.i.i.preheader ] + %38 = phi %struct.ClassProb* [ %incdec.ptr.i.i.i.i, %while.body.i.i.i ], [ %__i.sroa.0.0.sink53.i.i, %while.body.i.i.i.preheader ] + %indvars55.i.i = bitcast %struct.ClassProb* %38 to i64* + store i64 %36, i64* %37, align 4 + %incdec.ptr.i.i.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %38, i64 -1 + %indvars.i.i = bitcast %struct.ClassProb* %incdec.ptr.i.i.i.i to i64* + %agg.tmp2.sroa.0.0.copyload.i.i.i.i = load i64, i64* %indvars.i.i, align 4 + %obj2.sroa.0.0.extract.trunc.i105 = trunc i64 %agg.tmp2.sroa.0.0.copyload.i.i.i.i to i32 + %39 = bitcast i32 %obj2.sroa.0.0.extract.trunc.i105 to float + %cmp.i106 = fcmp fast ogt float %32, %39 + br i1 %cmp.i106, label %while.body.i.i.i, label %for.inc.i.i.loopexit + +for.inc.i.i.loopexit: ; preds = %while.body.i.i.i + br label %for.inc.i.i + +for.inc.i.i: ; preds = %for.inc.i.i.loopexit, %if.else.i.i, %if.then.i.i.i.i.i.i, %if.then10.i.i + %agg.tmp3.sroa.0.0..sroa_cast.i.sink.i.i = phi i64* [ %agg.tmp.sroa.0.0..sroa_cast.i.i.i, %if.else.i.i ], [ %agg.tmp3.sroa.0.0..sroa_cast.i.i.i, %if.then10.i.i ], [ %agg.tmp3.sroa.0.0..sroa_cast.i.i.i, %if.then.i.i.i.i.i.i ], [ %indvars55.i.i, %for.inc.i.i.loopexit ] + store i64 %agg.tmp.sroa.0.0.copyload.i.i.i, i64* %agg.tmp3.sroa.0.0..sroa_cast.i.sink.i.i, align 4 + %incdec.ptr.i.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %incdec.ptr.i54.i.i, i64 1 + %cmp.i38.i.i = icmp eq %struct.ClassProb* %incdec.ptr.i.i.i, %9 + br i1 %cmp.i38.i.i, label %for.cond16.preheader.loopexit179, label %for.body.i.i + +for.body7: ; preds = %_ZNSt6vectorI9ClassProbSaIS0_EE9push_backERKS0_.exit.for.body7_crit_edge, %for.body7.lr.ph + %elem_probs.sroa.15.0.elem_probs.sroa.15.16. = phi %struct.ClassProb* [ null, %for.body7.lr.ph ], [ %elem_probs.sroa.15.0.elem_probs.sroa.15.16..pre, %_ZNSt6vectorI9ClassProbSaIS0_EE9push_backERKS0_.exit.for.body7_crit_edge ] + %elem_probs.sroa.9.0.elem_probs.sroa.9.8..pre.i.i = phi i64 [ 0, %for.body7.lr.ph ], [ %elem_probs.sroa.9.0.elem_probs.sroa.9.8..pre, %_ZNSt6vectorI9ClassProbSaIS0_EE9push_backERKS0_.exit.for.body7_crit_edge ] + %indvars.iv = phi i64 [ 0, %for.body7.lr.ph ], [ %indvars.iv.next, %_ZNSt6vectorI9ClassProbSaIS0_EE9push_backERKS0_.exit.for.body7_crit_edge ] + %elem_probs.sroa.0.0157 = phi i64 [ 0, %for.body7.lr.ph ], [ %elem_probs.sroa.0.1, %_ZNSt6vectorI9ClassProbSaIS0_EE9push_backERKS0_.exit.for.body7_crit_edge ] + %add = add i64 %indvars.iv, %mul + %arrayidx9 = getelementptr inbounds float, float* %5, i64 %add + %40 = bitcast float* %arrayidx9 to i32* + %41 = load i32, i32* %40, align 4, !tbaa !71 + %42 = inttoptr i64 %elem_probs.sroa.9.0.elem_probs.sroa.9.8..pre.i.i to %struct.ClassProb* + %cmp.i111 = icmp eq %struct.ClassProb* %42, %elem_probs.sroa.15.0.elem_probs.sroa.15.16. + br i1 %cmp.i111, label %if.else.i112, label %if.then.i + +if.then.i: ; preds = %for.body7 + %43 = inttoptr i64 %elem_probs.sroa.9.0.elem_probs.sroa.9.8..pre.i.i to i64* + %cProb.sroa.5.0.insert.shift = shl nuw i64 %indvars.iv, 32 + %cProb.sroa.0.0.insert.ext = zext i32 %41 to i64 + %cProb.sroa.0.0.insert.insert = or i64 %cProb.sroa.0.0.insert.ext, %cProb.sroa.5.0.insert.shift + store i64 %cProb.sroa.0.0.insert.insert, i64* %43, align 4 + %elem_probs.sroa.9.0.elem_probs.sroa.9.8.145152 = load %struct.ClassProb*, %struct.ClassProb** %elem_probs.sroa.9.0._M_finish.i110.sroa_cast, align 8 + %incdec.ptr.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %elem_probs.sroa.9.0.elem_probs.sroa.9.8.145152, i64 1 + br label %_ZNSt6vectorI9ClassProbSaIS0_EE9push_backERKS0_.exit + +if.else.i112: ; preds = %for.body7 + %sub.ptr.sub.i21.i.i.i = sub i64 %elem_probs.sroa.9.0.elem_probs.sroa.9.8..pre.i.i, %elem_probs.sroa.0.0157 + %sub.ptr.div.i22.i.i.i = ashr exact i64 %sub.ptr.sub.i21.i.i.i, 3 + %cmp.i.i.i.i = icmp eq i64 %sub.ptr.div.i22.i.i.i, 0 + %.sroa.speculated.i.i.i = select i1 %cmp.i.i.i.i, i64 1, i64 %sub.ptr.div.i22.i.i.i + %add.i.i.i = add nsw i64 %.sroa.speculated.i.i.i, %sub.ptr.div.i22.i.i.i + %cmp7.i.i.i = icmp ult i64 %add.i.i.i, %sub.ptr.div.i22.i.i.i + %cmp9.i.i.i = icmp ugt i64 %add.i.i.i, 2305843009213693951 + %or.cond.i.i.i = or i1 %cmp7.i.i.i, %cmp9.i.i.i + %cond.i.i.i = select i1 %or.cond.i.i.i, i64 2305843009213693951, i64 %add.i.i.i + %cmp.i35.i.i = icmp eq i64 %cond.i.i.i, 0 + br i1 %cmp.i35.i.i, label %_ZNSt12_Vector_baseI9ClassProbSaIS0_EE11_M_allocateEm.exit.i.i, label %cond.true.i.i.i + +cond.true.i.i.i: ; preds = %if.else.i112 + %cmp.i.i.i.i.i = icmp ugt i64 %cond.i.i.i, 2305843009213693951 + br i1 %cmp.i.i.i.i.i, label %if.then.i.i.i.i.i113, label %_ZNSt16allocator_traitsISaI9ClassProbEE8allocateERS1_m.exit.i.i.i + +if.then.i.i.i.i.i113: ; preds = %cond.true.i.i.i + tail call void @_ZSt17__throw_bad_allocv() #13 + unreachable + +_ZNSt16allocator_traitsISaI9ClassProbEE8allocateERS1_m.exit.i.i.i: ; preds = %cond.true.i.i.i + %mul.i.i.i.i.i = shl i64 %cond.i.i.i, 3 + %call2.i.i.i.i.i = tail call i8* @_Znwm(i64 %mul.i.i.i.i.i) #2 + %44 = bitcast i8* %call2.i.i.i.i.i to %struct.ClassProb* + br label %_ZNSt12_Vector_baseI9ClassProbSaIS0_EE11_M_allocateEm.exit.i.i + +_ZNSt12_Vector_baseI9ClassProbSaIS0_EE11_M_allocateEm.exit.i.i: ; preds = %_ZNSt16allocator_traitsISaI9ClassProbEE8allocateERS1_m.exit.i.i.i, %if.else.i112 + %45 = phi i8* [ %call2.i.i.i.i.i, %_ZNSt16allocator_traitsISaI9ClassProbEE8allocateERS1_m.exit.i.i.i ], [ null, %if.else.i112 ] + %cond.i36.i.i = phi %struct.ClassProb* [ %44, %_ZNSt16allocator_traitsISaI9ClassProbEE8allocateERS1_m.exit.i.i.i ], [ null, %if.else.i112 ] + %add.ptr.i.i116 = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %cond.i36.i.i, i64 %sub.ptr.div.i22.i.i.i + %46 = bitcast %struct.ClassProb* %add.ptr.i.i116 to i64* + %cProb.sroa.5.0.insert.shift137 = shl nuw i64 %indvars.iv, 32 + %cProb.sroa.0.0.insert.ext131 = zext i32 %41 to i64 + %cProb.sroa.0.0.insert.insert133 = or i64 %cProb.sroa.0.0.insert.ext131, %cProb.sroa.5.0.insert.shift137 + store i64 %cProb.sroa.0.0.insert.insert133, i64* %46, align 4 + br i1 %cmp.i.i.i.i, label %_ZSt34__uninitialized_move_if_noexcept_aIP9ClassProbS1_SaIS0_EET0_T_S4_S3_RT1_.exit.i.i, label %if.then.i.i.i.i.i.i.i.i.i.i + +if.then.i.i.i.i.i.i.i.i.i.i: ; preds = %_ZNSt12_Vector_baseI9ClassProbSaIS0_EE11_M_allocateEm.exit.i.i + %47 = inttoptr i64 %elem_probs.sroa.0.0157 to i8* + tail call void @llvm.memmove.p0i8.p0i8.i64(i8* %45, i8* %47, i64 %sub.ptr.sub.i21.i.i.i, i32 4, i1 false) #2 + br label %_ZSt34__uninitialized_move_if_noexcept_aIP9ClassProbS1_SaIS0_EET0_T_S4_S3_RT1_.exit.i.i + +_ZSt34__uninitialized_move_if_noexcept_aIP9ClassProbS1_SaIS0_EET0_T_S4_S3_RT1_.exit.i.i: ; preds = %if.then.i.i.i.i.i.i.i.i.i.i, %_ZNSt12_Vector_baseI9ClassProbSaIS0_EE11_M_allocateEm.exit.i.i + %incdec.ptr.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %add.ptr.i.i116, i64 1 + %tobool.i.i.i = icmp eq i64 %elem_probs.sroa.0.0157, 0 + br i1 %tobool.i.i.i, label %_ZNSt6vectorI9ClassProbSaIS0_EE19_M_emplace_back_auxIJRKS0_EEEvDpOT_.exit.i, label %if.then.i37.i.i + +if.then.i37.i.i: ; preds = %_ZSt34__uninitialized_move_if_noexcept_aIP9ClassProbS1_SaIS0_EET0_T_S4_S3_RT1_.exit.i.i + %48 = inttoptr i64 %elem_probs.sroa.0.0157 to i8* + tail call void @_ZdlPv(i8* %48) #2 + br label %_ZNSt6vectorI9ClassProbSaIS0_EE19_M_emplace_back_auxIJRKS0_EEEvDpOT_.exit.i + +_ZNSt6vectorI9ClassProbSaIS0_EE19_M_emplace_back_auxIJRKS0_EEEvDpOT_.exit.i: ; preds = %if.then.i37.i.i, %_ZSt34__uninitialized_move_if_noexcept_aIP9ClassProbS1_SaIS0_EET0_T_S4_S3_RT1_.exit.i.i + %49 = ptrtoint i8* %45 to i64 + %50 = ptrtoint %struct.ClassProb* %incdec.ptr.i.i to i64 + store i64 %50, i64* %elem_probs.sroa.9, align 8 + %add.ptr23.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %cond.i36.i.i, i64 %cond.i.i.i + br label %_ZNSt6vectorI9ClassProbSaIS0_EE9push_backERKS0_.exit + +_ZNSt6vectorI9ClassProbSaIS0_EE9push_backERKS0_.exit: ; preds = %_ZNSt6vectorI9ClassProbSaIS0_EE19_M_emplace_back_auxIJRKS0_EEEvDpOT_.exit.i, %if.then.i + %elem_probs.sroa.0.1 = phi i64 [ %49, %_ZNSt6vectorI9ClassProbSaIS0_EE19_M_emplace_back_auxIJRKS0_EEEvDpOT_.exit.i ], [ %elem_probs.sroa.0.0157, %if.then.i ] + %_M_end_of_storage.sink.i = phi %struct.ClassProb** [ %elem_probs.sroa.15, %_ZNSt6vectorI9ClassProbSaIS0_EE19_M_emplace_back_auxIJRKS0_EEEvDpOT_.exit.i ], [ %elem_probs.sroa.9.0._M_finish.i110.sroa_cast, %if.then.i ] + %add.ptr23.i.sink.i = phi %struct.ClassProb* [ %add.ptr23.i.i, %_ZNSt6vectorI9ClassProbSaIS0_EE19_M_emplace_back_auxIJRKS0_EEEvDpOT_.exit.i ], [ %incdec.ptr.i, %if.then.i ] + store %struct.ClassProb* %add.ptr23.i.sink.i, %struct.ClassProb** %_M_end_of_storage.sink.i, align 8, !tbaa !124 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %cmp5 = icmp ult i64 %indvars.iv.next, %6 + %elem_probs.sroa.9.0.elem_probs.sroa.9.8..pre = load i64, i64* %elem_probs.sroa.9, align 8 + br i1 %cmp5, label %_ZNSt6vectorI9ClassProbSaIS0_EE9push_backERKS0_.exit.for.body7_crit_edge, label %for.cond.cleanup6.loopexit + +_ZNSt6vectorI9ClassProbSaIS0_EE9push_backERKS0_.exit.for.body7_crit_edge: ; preds = %_ZNSt6vectorI9ClassProbSaIS0_EE9push_backERKS0_.exit + %elem_probs.sroa.15.0.elem_probs.sroa.15.16..pre = load %struct.ClassProb*, %struct.ClassProb** %elem_probs.sroa.15, align 8 + br label %for.body7 + +if.then.i.i.i: ; preds = %for.cond16.preheader + %51 = inttoptr i64 %elem_probs.sroa.0.0.lcssa to i8* + tail call void @_ZdlPv(i8* %51) #2 + br label %_ZNSt6vectorI9ClassProbSaIS0_EED2Ev.exit + +_ZNSt6vectorI9ClassProbSaIS0_EED2Ev.exit: ; preds = %if.then.i.i.i, %for.cond16.preheader + call void @llvm.lifetime.end(i64 8, i8* nonnull %elem_probs.sroa.9.0..sroa_cast151) + call void @llvm.lifetime.end(i64 8, i8* nonnull %elem_probs.sroa.15.0..sroa_cast149) + %indvars.iv.next170 = add nuw nsw i64 %indvars.iv169, 1 + %cmp = icmp slt i64 %indvars.iv.next170, %7 + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit + +if.then47: ; preds = %for.cond.cleanup + %52 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i8* + call void @llvm.lifetime.start(i64 376, i8* nonnull %52) #2 + %53 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2 + %54 = getelementptr inbounds %"class.std::basic_ios", %"class.std::basic_ios"* %53, i64 0, i32 0 + call void @_ZNSt8ios_baseC2Ev(%"class.std::ios_base"* %54) #2 + %55 = getelementptr inbounds %"class.std::basic_ios", %"class.std::basic_ios"* %53, i64 0, i32 0, i32 0 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [4 x i8*] }, { [4 x i8*] }* @_ZTVSt9basic_iosIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %55, align 16, !tbaa !97 + %_M_tie.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 1 + store %"class.std::basic_ostream"* null, %"class.std::basic_ostream"** %_M_tie.i.i, align 8, !tbaa !99 + %_M_fill.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 2 + store i8 0, i8* %_M_fill.i.i, align 16, !tbaa !102 + %_M_fill_init.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 3 + store i8 0, i8* %_M_fill_init.i.i, align 1, !tbaa !103 + %_M_streambuf.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 4 + %56 = bitcast %"class.std::basic_streambuf"** %_M_streambuf.i.i to i8* + call void @llvm.memset.p0i8.i64(i8* %56, i8 0, i64 32, i32 8, i1 false) #2 + %57 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 1) to i64*), align 8 + %58 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i64* + store i64 %57, i64* %58, align 16, !tbaa !97 + %59 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 2) to i64*), align 8 + %60 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i8** + %vtable.cast.i.i118 = inttoptr i64 %57 to i8* + %vbase.offset.ptr.i.i119 = getelementptr i8, i8* %vtable.cast.i.i118, i64 -24 + %61 = bitcast i8* %vbase.offset.ptr.i.i119 to i64* + %vbase.offset.i.i120 = load i64, i64* %61, align 8 + %add.ptr.i.i121 = getelementptr inbounds i8, i8* %52, i64 %vbase.offset.i.i120 + %62 = bitcast i8* %add.ptr.i.i121 to i64* + store i64 %59, i64* %62, align 8, !tbaa !97 + %vtable3.i.i = load i8*, i8** %60, align 16, !tbaa !97 + %vbase.offset.ptr4.i.i = getelementptr i8, i8* %vtable3.i.i, i64 -24 + %63 = bitcast i8* %vbase.offset.ptr4.i.i to i64* + %vbase.offset5.i.i = load i64, i64* %63, align 8 + %add.ptr6.i.i = getelementptr inbounds i8, i8* %52, i64 %vbase.offset5.i.i + %64 = bitcast i8* %add.ptr6.i.i to %"class.std::basic_ios"* + call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %64, %"class.std::basic_streambuf"* null) #2 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 1, i64 3) to i32 (...)**), i32 (...)*** %55, align 16, !tbaa !97 + %_M_stringbuf.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1 + %65 = getelementptr inbounds %"class.std::__cxx11::basic_stringbuf", %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i, i64 0, i32 0, i32 0 + %66 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to <2 x i32 (...)**>* + store <2 x i32 (...)**> <i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 3) to i32 (...)**), i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**)>, <2 x i32 (...)**>* %66, align 16, !tbaa !97 + %_M_in_beg.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 1 + %_M_buf_locale.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 7 + %67 = bitcast i8** %_M_in_beg.i.i.i to i8* + call void @llvm.memset.p0i8.i64(i8* %67, i8 0, i64 48, i32 8, i1 false) #2 + call void @_ZNSt6localeC1Ev(%"class.std::locale"* %_M_buf_locale.i.i.i) #2 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %65, align 8, !tbaa !97 + %_M_mode.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 1 + store i32 16, i32* %_M_mode.i.i, align 8, !tbaa !104 + %_M_string.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2 + %68 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 2 + %69 = bitcast %"class.std::__cxx11::basic_string"* %_M_string.i.i to %union.anon** + store %union.anon* %68, %union.anon** %69, align 8, !tbaa !109 + %_M_string_length.i.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 1 + store i64 0, i64* %_M_string_length.i.i.i.i.i, align 8, !tbaa !110 + %.cast.i.i.i = bitcast %union.anon* %68 to i8* + store i8 0, i8* %.cast.i.i.i, align 8, !tbaa !93 + %vtable.i = load i8*, i8** %60, align 16, !tbaa !97 + %vbase.offset.ptr.i = getelementptr i8, i8* %vtable.i, i64 -24 + %70 = bitcast i8* %vbase.offset.ptr.i to i64* + %vbase.offset.i = load i64, i64* %70, align 8 + %add.ptr2.i = getelementptr inbounds i8, i8* %52, i64 %vbase.offset.i + %71 = bitcast i8* %add.ptr2.i to %"class.std::basic_ios"* + %72 = getelementptr inbounds %"class.std::__cxx11::basic_stringbuf", %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i, i64 0, i32 0 + call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %71, %"class.std::basic_streambuf"* %72) #2 + %73 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to %"class.std::basic_ostream"* + %vtable.i122 = load i8*, i8** %60, align 16, !tbaa !97 + %vbase.offset.ptr.i123 = getelementptr i8, i8* %vtable.i122, i64 -24 + %74 = bitcast i8* %vbase.offset.ptr.i123 to i64* + %vbase.offset.i124 = load i64, i64* %74, align 8 + %add.ptr.i125 = getelementptr inbounds i8, i8* %52, i64 %vbase.offset.i124 + %_M_flags.i.i = getelementptr inbounds i8, i8* %add.ptr.i125, i64 24 + %75 = bitcast i8* %_M_flags.i.i to i32* + %76 = load i32, i32* %75, align 8, !tbaa !116 + %and.i.i.i.i = and i32 %76, -261 + %or.i.i.i.i = or i32 %and.i.i.i.i, 4 + store i32 %or.i.i.i.i, i32* %75, align 4, !tbaa !111 + %call.i = call dereferenceable(272) %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"* nonnull %73, double %conv43) #2 + %77 = bitcast %"class.std::__cxx11::basic_string"* %print_str to i8* + call void @llvm.lifetime.start(i64 32, i8* nonnull %77) #2 + call void @_ZNKSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEE3strEv(%"class.std::__cxx11::basic_string"* nonnull sret %print_str, %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i) #2 + %_M_p.i.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 0, i32 0 + %78 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !113 + %_M_string_length.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 1 + %79 = load i64, i64* %_M_string_length.i, align 8, !tbaa !110 + %call52 = call i64 @fwrite(i8* %78, i64 1, i64 %79, %struct._IO_FILE* nonnull %call45) + %80 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !113 + %81 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 2 + %arraydecay.i.i.i.i = bitcast %union.anon* %81 to i8* + %cmp.i.i.i127 = icmp eq i8* %80, %arraydecay.i.i.i.i + br i1 %cmp.i.i.i127, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit, label %if.then.i.i128 + +if.then.i.i128: ; preds = %if.then47 + call void @_ZdlPv(i8* %80) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit: ; preds = %if.then.i.i128, %if.then47 + call void @llvm.lifetime.end(i64 32, i8* nonnull %77) #2 + %82 = load i64, i64* bitcast ([4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE to i64*), align 8 + store i64 %82, i64* %58, align 16, !tbaa !97 + %83 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 3) to i64*), align 8 + %vtable.cast.i.i = inttoptr i64 %82 to i8* + %vbase.offset.ptr.i.i = getelementptr i8, i8* %vtable.cast.i.i, i64 -24 + %84 = bitcast i8* %vbase.offset.ptr.i.i to i64* + %vbase.offset.i.i = load i64, i64* %84, align 8 + %add.ptr.i.i = getelementptr inbounds i8, i8* %52, i64 %vbase.offset.i.i + %85 = bitcast i8* %add.ptr.i.i to i64* + store i64 %83, i64* %85, align 8, !tbaa !97 + %86 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 0 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %86, align 8, !tbaa !97 + %_M_p.i.i.i.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 0, i32 0 + %87 = load i8*, i8** %_M_p.i.i.i.i.i.i.i, align 8, !tbaa !113 + %cmp.i.i.i.i.i.i = icmp eq i8* %87, %.cast.i.i.i + br i1 %cmp.i.i.i.i.i.i, label %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit, label %if.then.i.i.i.i.i + +if.then.i.i.i.i.i: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit + call void @_ZdlPv(i8* %87) #2 + br label %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit + +_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit: ; preds = %if.then.i.i.i.i.i, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %86, align 8, !tbaa !97 + call void @_ZNSt6localeD1Ev(%"class.std::locale"* nonnull %_M_buf_locale.i.i.i) #2 + %88 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 0 + call void @_ZNSt8ios_baseD2Ev(%"class.std::ios_base"* %88) #2 + call void @llvm.lifetime.end(i64 376, i8* nonnull %52) #2 + br label %if.end53 + +if.end53: ; preds = %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit, %for.cond.cleanup + %call54 = call i32 @fclose(%struct._IO_FILE* %call45) + ret float %conv42 +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #4 + +; Function Attrs: nounwind uwtable +define void @_Z17dumpFinalAccuracyf(float %accuracy) local_unnamed_addr #3 { +entry: + %ss = alloca %"class.std::__cxx11::basic_ostringstream", align 16 + %print_str = alloca %"class.std::__cxx11::basic_string", align 8 + %conv = fpext float %accuracy to double + %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([30 x i8], [30 x i8]* @.str.26, i64 0, i64 0), double %conv) + %call1 = tail call %struct._IO_FILE* @fopen(i8* getelementptr inbounds ([15 x i8], [15 x i8]* @.str.22, i64 0, i64 0), i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.23, i64 0, i64 0)) + %cmp = icmp eq %struct._IO_FILE* %call1, null + br i1 %cmp, label %if.end, label %if.then + +if.then: ; preds = %entry + %0 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i8* + call void @llvm.lifetime.start(i64 376, i8* nonnull %0) #2 + %1 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2 + %2 = getelementptr inbounds %"class.std::basic_ios", %"class.std::basic_ios"* %1, i64 0, i32 0 + call void @_ZNSt8ios_baseC2Ev(%"class.std::ios_base"* %2) #2 + %3 = getelementptr inbounds %"class.std::basic_ios", %"class.std::basic_ios"* %1, i64 0, i32 0, i32 0 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [4 x i8*] }, { [4 x i8*] }* @_ZTVSt9basic_iosIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %3, align 16, !tbaa !97 + %_M_tie.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 1 + store %"class.std::basic_ostream"* null, %"class.std::basic_ostream"** %_M_tie.i.i, align 8, !tbaa !99 + %_M_fill.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 2 + store i8 0, i8* %_M_fill.i.i, align 16, !tbaa !102 + %_M_fill_init.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 3 + store i8 0, i8* %_M_fill_init.i.i, align 1, !tbaa !103 + %_M_streambuf.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 4 + %4 = bitcast %"class.std::basic_streambuf"** %_M_streambuf.i.i to i8* + call void @llvm.memset.p0i8.i64(i8* %4, i8 0, i64 32, i32 8, i1 false) #2 + %5 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 1) to i64*), align 8 + %6 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i64* + store i64 %5, i64* %6, align 16, !tbaa !97 + %7 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 2) to i64*), align 8 + %8 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i8** + %vtable.cast.i.i = inttoptr i64 %5 to i8* + %vbase.offset.ptr.i.i = getelementptr i8, i8* %vtable.cast.i.i, i64 -24 + %9 = bitcast i8* %vbase.offset.ptr.i.i to i64* + %vbase.offset.i.i = load i64, i64* %9, align 8 + %add.ptr.i.i = getelementptr inbounds i8, i8* %0, i64 %vbase.offset.i.i + %10 = bitcast i8* %add.ptr.i.i to i64* + store i64 %7, i64* %10, align 8, !tbaa !97 + %vtable3.i.i = load i8*, i8** %8, align 16, !tbaa !97 + %vbase.offset.ptr4.i.i = getelementptr i8, i8* %vtable3.i.i, i64 -24 + %11 = bitcast i8* %vbase.offset.ptr4.i.i to i64* + %vbase.offset5.i.i = load i64, i64* %11, align 8 + %add.ptr6.i.i = getelementptr inbounds i8, i8* %0, i64 %vbase.offset5.i.i + %12 = bitcast i8* %add.ptr6.i.i to %"class.std::basic_ios"* + call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %12, %"class.std::basic_streambuf"* null) #2 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 1, i64 3) to i32 (...)**), i32 (...)*** %3, align 16, !tbaa !97 + %_M_stringbuf.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1 + %13 = getelementptr inbounds %"class.std::__cxx11::basic_stringbuf", %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i, i64 0, i32 0, i32 0 + %14 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to <2 x i32 (...)**>* + store <2 x i32 (...)**> <i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 3) to i32 (...)**), i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**)>, <2 x i32 (...)**>* %14, align 16, !tbaa !97 + %_M_in_beg.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 1 + %_M_buf_locale.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 7 + %15 = bitcast i8** %_M_in_beg.i.i.i to i8* + call void @llvm.memset.p0i8.i64(i8* %15, i8 0, i64 48, i32 8, i1 false) #2 + call void @_ZNSt6localeC1Ev(%"class.std::locale"* %_M_buf_locale.i.i.i) #2 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %13, align 8, !tbaa !97 + %_M_mode.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 1 + store i32 16, i32* %_M_mode.i.i, align 8, !tbaa !104 + %_M_string.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2 + %16 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 2 + %17 = bitcast %"class.std::__cxx11::basic_string"* %_M_string.i.i to %union.anon** + store %union.anon* %16, %union.anon** %17, align 8, !tbaa !109 + %_M_string_length.i.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 1 + store i64 0, i64* %_M_string_length.i.i.i.i.i, align 8, !tbaa !110 + %.cast.i.i.i = bitcast %union.anon* %16 to i8* + store i8 0, i8* %.cast.i.i.i, align 8, !tbaa !93 + %vtable.i = load i8*, i8** %8, align 16, !tbaa !97 + %vbase.offset.ptr.i = getelementptr i8, i8* %vtable.i, i64 -24 + %18 = bitcast i8* %vbase.offset.ptr.i to i64* + %vbase.offset.i = load i64, i64* %18, align 8 + %add.ptr2.i = getelementptr inbounds i8, i8* %0, i64 %vbase.offset.i + %19 = bitcast i8* %add.ptr2.i to %"class.std::basic_ios"* + %20 = getelementptr inbounds %"class.std::__cxx11::basic_stringbuf", %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i, i64 0, i32 0 + call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %19, %"class.std::basic_streambuf"* %20) #2 + %21 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to %"class.std::basic_ostream"* + %vtable.i11 = load i8*, i8** %8, align 16, !tbaa !97 + %vbase.offset.ptr.i12 = getelementptr i8, i8* %vtable.i11, i64 -24 + %22 = bitcast i8* %vbase.offset.ptr.i12 to i64* + %vbase.offset.i13 = load i64, i64* %22, align 8 + %add.ptr.i = getelementptr inbounds i8, i8* %0, i64 %vbase.offset.i13 + %_M_flags.i.i = getelementptr inbounds i8, i8* %add.ptr.i, i64 24 + %23 = bitcast i8* %_M_flags.i.i to i32* + %24 = load i32, i32* %23, align 8, !tbaa !116 + %and.i.i.i.i = and i32 %24, -261 + %or.i.i.i.i = or i32 %and.i.i.i.i, 4 + store i32 %or.i.i.i.i, i32* %23, align 4, !tbaa !111 + %call.i = call dereferenceable(272) %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"* nonnull %21, double %conv) #2 + %25 = bitcast %"class.std::__cxx11::basic_string"* %print_str to i8* + call void @llvm.lifetime.start(i64 32, i8* nonnull %25) #2 + call void @_ZNKSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEE3strEv(%"class.std::__cxx11::basic_string"* nonnull sret %print_str, %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i) #2 + %_M_p.i.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 0, i32 0 + %26 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !113 + %_M_string_length.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 1 + %27 = load i64, i64* %_M_string_length.i, align 8, !tbaa !110 + %call6 = call i64 @fwrite(i8* %26, i64 1, i64 %27, %struct._IO_FILE* nonnull %call1) + %28 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !113 + %29 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 2 + %arraydecay.i.i.i.i = bitcast %union.anon* %29 to i8* + %cmp.i.i.i = icmp eq i8* %28, %arraydecay.i.i.i.i + br i1 %cmp.i.i.i, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit, label %if.then.i.i + +if.then.i.i: ; preds = %if.then + call void @_ZdlPv(i8* %28) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit: ; preds = %if.then.i.i, %if.then + call void @llvm.lifetime.end(i64 32, i8* nonnull %25) #2 + %30 = load i64, i64* bitcast ([4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE to i64*), align 8 + store i64 %30, i64* %6, align 16, !tbaa !97 + %31 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 3) to i64*), align 8 + %vtable.cast.i.i15 = inttoptr i64 %30 to i8* + %vbase.offset.ptr.i.i16 = getelementptr i8, i8* %vtable.cast.i.i15, i64 -24 + %32 = bitcast i8* %vbase.offset.ptr.i.i16 to i64* + %vbase.offset.i.i17 = load i64, i64* %32, align 8 + %add.ptr.i.i18 = getelementptr inbounds i8, i8* %0, i64 %vbase.offset.i.i17 + %33 = bitcast i8* %add.ptr.i.i18 to i64* + store i64 %31, i64* %33, align 8, !tbaa !97 + %34 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 0 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %34, align 8, !tbaa !97 + %_M_p.i.i.i.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 0, i32 0 + %35 = load i8*, i8** %_M_p.i.i.i.i.i.i.i, align 8, !tbaa !113 + %cmp.i.i.i.i.i.i = icmp eq i8* %35, %.cast.i.i.i + br i1 %cmp.i.i.i.i.i.i, label %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit, label %if.then.i.i.i.i.i + +if.then.i.i.i.i.i: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit + call void @_ZdlPv(i8* %35) #2 + br label %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit + +_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit: ; preds = %if.then.i.i.i.i.i, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %34, align 8, !tbaa !97 + call void @_ZNSt6localeD1Ev(%"class.std::locale"* nonnull %_M_buf_locale.i.i.i) #2 + %36 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 0 + call void @_ZNSt8ios_baseD2Ev(%"class.std::ios_base"* %36) #2 + call void @llvm.lifetime.end(i64 376, i8* nonnull %0) #2 + br label %if.end + +if.end: ; preds = %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit, %entry + %call7 = call i32 @fclose(%struct._IO_FILE* %call1) + %37 = load float*, float** getelementptr inbounds (%"class.std::vector", %"class.std::vector"* @run_accuracies, i64 0, i32 0, i32 0, i32 1), align 8, !tbaa !125 + %38 = load float*, float** getelementptr inbounds (%"class.std::vector", %"class.std::vector"* @run_accuracies, i64 0, i32 0, i32 0, i32 2), align 8, !tbaa !126 + %cmp.i = icmp eq float* %37, %38 + %39 = ptrtoint float* %37 to i64 + br i1 %cmp.i, label %if.else.i, label %if.then.i + +if.then.i: ; preds = %if.end + store float %accuracy, float* %37, align 4, !tbaa !71 + %incdec.ptr.i = getelementptr inbounds float, float* %37, i64 1 + br label %_ZNSt6vectorIfSaIfEE9push_backERKf.exit + +if.else.i: ; preds = %if.end + %40 = load i64, i64* bitcast (%"class.std::vector"* @run_accuracies to i64*), align 8, !tbaa !52 + %sub.ptr.sub.i21.i.i.i = sub i64 %39, %40 + %sub.ptr.div.i22.i.i.i = ashr exact i64 %sub.ptr.sub.i21.i.i.i, 2 + %cmp.i.i.i.i = icmp eq i64 %sub.ptr.div.i22.i.i.i, 0 + %.sroa.speculated.i.i.i = select i1 %cmp.i.i.i.i, i64 1, i64 %sub.ptr.div.i22.i.i.i + %add.i.i.i = add nsw i64 %.sroa.speculated.i.i.i, %sub.ptr.div.i22.i.i.i + %cmp7.i.i.i = icmp ult i64 %add.i.i.i, %sub.ptr.div.i22.i.i.i + %cmp9.i.i.i = icmp ugt i64 %add.i.i.i, 4611686018427387903 + %or.cond.i.i.i = or i1 %cmp7.i.i.i, %cmp9.i.i.i + %cond.i.i.i = select i1 %or.cond.i.i.i, i64 4611686018427387903, i64 %add.i.i.i + %cmp.i35.i.i = icmp eq i64 %cond.i.i.i, 0 + br i1 %cmp.i35.i.i, label %_ZNSt12_Vector_baseIfSaIfEE11_M_allocateEm.exit.i.i, label %cond.true.i.i.i + +cond.true.i.i.i: ; preds = %if.else.i + %cmp.i.i.i.i.i = icmp ugt i64 %cond.i.i.i, 4611686018427387903 + br i1 %cmp.i.i.i.i.i, label %if.then.i.i.i.i.i19, label %_ZNSt16allocator_traitsISaIfEE8allocateERS0_m.exit.i.i.i + +if.then.i.i.i.i.i19: ; preds = %cond.true.i.i.i + call void @_ZSt17__throw_bad_allocv() #13 + unreachable + +_ZNSt16allocator_traitsISaIfEE8allocateERS0_m.exit.i.i.i: ; preds = %cond.true.i.i.i + %mul.i.i.i.i.i = shl i64 %cond.i.i.i, 2 + %call2.i.i.i.i.i = call i8* @_Znwm(i64 %mul.i.i.i.i.i) #2 + %41 = bitcast i8* %call2.i.i.i.i.i to float* + %.pre.i.i = load i64, i64* bitcast (float** getelementptr inbounds (%"class.std::vector", %"class.std::vector"* @run_accuracies, i64 0, i32 0, i32 0, i32 1) to i64*), align 8, !tbaa !125 + %.pre38.i.i = load i64, i64* bitcast (%"class.std::vector"* @run_accuracies to i64*), align 8, !tbaa !52 + br label %_ZNSt12_Vector_baseIfSaIfEE11_M_allocateEm.exit.i.i + +_ZNSt12_Vector_baseIfSaIfEE11_M_allocateEm.exit.i.i: ; preds = %_ZNSt16allocator_traitsISaIfEE8allocateERS0_m.exit.i.i.i, %if.else.i + %.in.i.i = phi i64 [ %.pre38.i.i, %_ZNSt16allocator_traitsISaIfEE8allocateERS0_m.exit.i.i.i ], [ %40, %if.else.i ] + %42 = phi i64 [ %.pre.i.i, %_ZNSt16allocator_traitsISaIfEE8allocateERS0_m.exit.i.i.i ], [ %39, %if.else.i ] + %43 = phi i8* [ %call2.i.i.i.i.i, %_ZNSt16allocator_traitsISaIfEE8allocateERS0_m.exit.i.i.i ], [ null, %if.else.i ] + %cond.i36.i.i = phi float* [ %41, %_ZNSt16allocator_traitsISaIfEE8allocateERS0_m.exit.i.i.i ], [ null, %if.else.i ] + %sub.ptr.sub.i.i.i = sub i64 %42, %.in.i.i + %sub.ptr.div.i.i.i = ashr exact i64 %sub.ptr.sub.i.i.i, 2 + %add.ptr.i.i20 = getelementptr inbounds float, float* %cond.i36.i.i, i64 %sub.ptr.div.i.i.i + store float %accuracy, float* %add.ptr.i.i20, align 4, !tbaa !71 + %tobool.i.i.i.i.i.i.i.i.i.i = icmp eq i64 %sub.ptr.div.i.i.i, 0 + br i1 %tobool.i.i.i.i.i.i.i.i.i.i, label %_ZSt34__uninitialized_move_if_noexcept_aIPfS0_SaIfEET0_T_S3_S2_RT1_.exit.i.i, label %if.then.i.i.i.i.i.i.i.i.i.i + +if.then.i.i.i.i.i.i.i.i.i.i: ; preds = %_ZNSt12_Vector_baseIfSaIfEE11_M_allocateEm.exit.i.i + %44 = inttoptr i64 %.in.i.i to i8* + call void @llvm.memmove.p0i8.p0i8.i64(i8* %43, i8* %44, i64 %sub.ptr.sub.i.i.i, i32 4, i1 false) #2 + br label %_ZSt34__uninitialized_move_if_noexcept_aIPfS0_SaIfEET0_T_S3_S2_RT1_.exit.i.i + +_ZSt34__uninitialized_move_if_noexcept_aIPfS0_SaIfEET0_T_S3_S2_RT1_.exit.i.i: ; preds = %if.then.i.i.i.i.i.i.i.i.i.i, %_ZNSt12_Vector_baseIfSaIfEE11_M_allocateEm.exit.i.i + %incdec.ptr.i.i = getelementptr inbounds float, float* %add.ptr.i.i20, i64 1 + %tobool.i.i.i = icmp eq i64 %.in.i.i, 0 + br i1 %tobool.i.i.i, label %_ZNSt6vectorIfSaIfEE19_M_emplace_back_auxIJRKfEEEvDpOT_.exit.i, label %if.then.i37.i.i + +if.then.i37.i.i: ; preds = %_ZSt34__uninitialized_move_if_noexcept_aIPfS0_SaIfEET0_T_S3_S2_RT1_.exit.i.i + %45 = inttoptr i64 %.in.i.i to i8* + call void @_ZdlPv(i8* %45) #2 + br label %_ZNSt6vectorIfSaIfEE19_M_emplace_back_auxIJRKfEEEvDpOT_.exit.i + +_ZNSt6vectorIfSaIfEE19_M_emplace_back_auxIJRKfEEEvDpOT_.exit.i: ; preds = %if.then.i37.i.i, %_ZSt34__uninitialized_move_if_noexcept_aIPfS0_SaIfEET0_T_S3_S2_RT1_.exit.i.i + store i8* %43, i8** bitcast (%"class.std::vector"* @run_accuracies to i8**), align 8, !tbaa !52 + store float* %incdec.ptr.i.i, float** getelementptr inbounds (%"class.std::vector", %"class.std::vector"* @run_accuracies, i64 0, i32 0, i32 0, i32 1), align 8, !tbaa !125 + %add.ptr23.i.i = getelementptr inbounds float, float* %cond.i36.i.i, i64 %cond.i.i.i + br label %_ZNSt6vectorIfSaIfEE9push_backERKf.exit + +_ZNSt6vectorIfSaIfEE9push_backERKf.exit: ; preds = %_ZNSt6vectorIfSaIfEE19_M_emplace_back_auxIJRKfEEEvDpOT_.exit.i, %if.then.i + %_M_end_of_storage.sink.i = phi float** [ getelementptr inbounds (%"class.std::vector", %"class.std::vector"* @run_accuracies, i64 0, i32 0, i32 0, i32 2), %_ZNSt6vectorIfSaIfEE19_M_emplace_back_auxIJRKfEEEvDpOT_.exit.i ], [ getelementptr inbounds (%"class.std::vector", %"class.std::vector"* @run_accuracies, i64 0, i32 0, i32 0, i32 1), %if.then.i ] + %add.ptr23.i.sink.i = phi float* [ %add.ptr23.i.i, %_ZNSt6vectorIfSaIfEE19_M_emplace_back_auxIJRKfEEEvDpOT_.exit.i ], [ %incdec.ptr.i, %if.then.i ] + store float* %add.ptr23.i.sink.i, float** %_M_end_of_storage.sink.i, align 8, !tbaa !124 + ret void +} + +; Function Attrs: nounwind uwtable +define void @_Z11dumpAvgPSNRf(float %avg_psnr) local_unnamed_addr #3 { +entry: + %ss = alloca %"class.std::__cxx11::basic_ostringstream", align 16 + %print_str = alloca %"class.std::__cxx11::basic_string", align 8 + %call = tail call %struct._IO_FILE* @fopen(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str.27, i64 0, i64 0), i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.23, i64 0, i64 0)) + %cmp = icmp eq %struct._IO_FILE* %call, null + br i1 %cmp, label %if.end, label %if.then + +if.then: ; preds = %entry + %0 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i8* + call void @llvm.lifetime.start(i64 376, i8* nonnull %0) #2 + %1 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2 + %2 = getelementptr inbounds %"class.std::basic_ios", %"class.std::basic_ios"* %1, i64 0, i32 0 + call void @_ZNSt8ios_baseC2Ev(%"class.std::ios_base"* %2) #2 + %3 = getelementptr inbounds %"class.std::basic_ios", %"class.std::basic_ios"* %1, i64 0, i32 0, i32 0 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [4 x i8*] }, { [4 x i8*] }* @_ZTVSt9basic_iosIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %3, align 16, !tbaa !97 + %_M_tie.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 1 + store %"class.std::basic_ostream"* null, %"class.std::basic_ostream"** %_M_tie.i.i, align 8, !tbaa !99 + %_M_fill.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 2 + store i8 0, i8* %_M_fill.i.i, align 16, !tbaa !102 + %_M_fill_init.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 3 + store i8 0, i8* %_M_fill_init.i.i, align 1, !tbaa !103 + %_M_streambuf.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 4 + %4 = bitcast %"class.std::basic_streambuf"** %_M_streambuf.i.i to i8* + call void @llvm.memset.p0i8.i64(i8* %4, i8 0, i64 32, i32 8, i1 false) #2 + %5 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 1) to i64*), align 8 + %6 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i64* + store i64 %5, i64* %6, align 16, !tbaa !97 + %7 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 2) to i64*), align 8 + %8 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i8** + %vtable.cast.i.i = inttoptr i64 %5 to i8* + %vbase.offset.ptr.i.i = getelementptr i8, i8* %vtable.cast.i.i, i64 -24 + %9 = bitcast i8* %vbase.offset.ptr.i.i to i64* + %vbase.offset.i.i = load i64, i64* %9, align 8 + %add.ptr.i.i = getelementptr inbounds i8, i8* %0, i64 %vbase.offset.i.i + %10 = bitcast i8* %add.ptr.i.i to i64* + store i64 %7, i64* %10, align 8, !tbaa !97 + %vtable3.i.i = load i8*, i8** %8, align 16, !tbaa !97 + %vbase.offset.ptr4.i.i = getelementptr i8, i8* %vtable3.i.i, i64 -24 + %11 = bitcast i8* %vbase.offset.ptr4.i.i to i64* + %vbase.offset5.i.i = load i64, i64* %11, align 8 + %add.ptr6.i.i = getelementptr inbounds i8, i8* %0, i64 %vbase.offset5.i.i + %12 = bitcast i8* %add.ptr6.i.i to %"class.std::basic_ios"* + call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %12, %"class.std::basic_streambuf"* null) #2 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 1, i64 3) to i32 (...)**), i32 (...)*** %3, align 16, !tbaa !97 + %_M_stringbuf.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1 + %13 = getelementptr inbounds %"class.std::__cxx11::basic_stringbuf", %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i, i64 0, i32 0, i32 0 + %14 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to <2 x i32 (...)**>* + store <2 x i32 (...)**> <i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 3) to i32 (...)**), i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**)>, <2 x i32 (...)**>* %14, align 16, !tbaa !97 + %_M_in_beg.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 1 + %_M_buf_locale.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 7 + %15 = bitcast i8** %_M_in_beg.i.i.i to i8* + call void @llvm.memset.p0i8.i64(i8* %15, i8 0, i64 48, i32 8, i1 false) #2 + call void @_ZNSt6localeC1Ev(%"class.std::locale"* %_M_buf_locale.i.i.i) #2 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %13, align 8, !tbaa !97 + %_M_mode.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 1 + store i32 16, i32* %_M_mode.i.i, align 8, !tbaa !104 + %_M_string.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2 + %16 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 2 + %17 = bitcast %"class.std::__cxx11::basic_string"* %_M_string.i.i to %union.anon** + store %union.anon* %16, %union.anon** %17, align 8, !tbaa !109 + %_M_string_length.i.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 1 + store i64 0, i64* %_M_string_length.i.i.i.i.i, align 8, !tbaa !110 + %.cast.i.i.i = bitcast %union.anon* %16 to i8* + store i8 0, i8* %.cast.i.i.i, align 8, !tbaa !93 + %vtable.i = load i8*, i8** %8, align 16, !tbaa !97 + %vbase.offset.ptr.i = getelementptr i8, i8* %vtable.i, i64 -24 + %18 = bitcast i8* %vbase.offset.ptr.i to i64* + %vbase.offset.i = load i64, i64* %18, align 8 + %add.ptr2.i = getelementptr inbounds i8, i8* %0, i64 %vbase.offset.i + %19 = bitcast i8* %add.ptr2.i to %"class.std::basic_ios"* + %20 = getelementptr inbounds %"class.std::__cxx11::basic_stringbuf", %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i, i64 0, i32 0 + call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %19, %"class.std::basic_streambuf"* %20) #2 + %21 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to %"class.std::basic_ostream"* + %vtable.i10 = load i8*, i8** %8, align 16, !tbaa !97 + %vbase.offset.ptr.i11 = getelementptr i8, i8* %vtable.i10, i64 -24 + %22 = bitcast i8* %vbase.offset.ptr.i11 to i64* + %vbase.offset.i12 = load i64, i64* %22, align 8 + %add.ptr.i = getelementptr inbounds i8, i8* %0, i64 %vbase.offset.i12 + %_M_flags.i.i = getelementptr inbounds i8, i8* %add.ptr.i, i64 24 + %23 = bitcast i8* %_M_flags.i.i to i32* + %24 = load i32, i32* %23, align 8, !tbaa !116 + %and.i.i.i.i = and i32 %24, -261 + %or.i.i.i.i = or i32 %and.i.i.i.i, 4 + store i32 %or.i.i.i.i, i32* %23, align 4, !tbaa !111 + %conv.i = fpext float %avg_psnr to double + %call.i = call dereferenceable(272) %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"* nonnull %21, double %conv.i) #2 + %25 = bitcast %"class.std::__cxx11::basic_string"* %print_str to i8* + call void @llvm.lifetime.start(i64 32, i8* nonnull %25) #2 + call void @_ZNKSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEE3strEv(%"class.std::__cxx11::basic_string"* nonnull sret %print_str, %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i) #2 + %_M_p.i.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 0, i32 0 + %26 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !113 + %_M_string_length.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 1 + %27 = load i64, i64* %_M_string_length.i, align 8, !tbaa !110 + %call5 = call i64 @fwrite(i8* %26, i64 1, i64 %27, %struct._IO_FILE* nonnull %call) + %28 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !113 + %29 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 2 + %arraydecay.i.i.i.i = bitcast %union.anon* %29 to i8* + %cmp.i.i.i = icmp eq i8* %28, %arraydecay.i.i.i.i + br i1 %cmp.i.i.i, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit, label %if.then.i.i + +if.then.i.i: ; preds = %if.then + call void @_ZdlPv(i8* %28) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit: ; preds = %if.then.i.i, %if.then + call void @llvm.lifetime.end(i64 32, i8* nonnull %25) #2 + %30 = load i64, i64* bitcast ([4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE to i64*), align 8 + store i64 %30, i64* %6, align 16, !tbaa !97 + %31 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 3) to i64*), align 8 + %vtable.cast.i.i14 = inttoptr i64 %30 to i8* + %vbase.offset.ptr.i.i15 = getelementptr i8, i8* %vtable.cast.i.i14, i64 -24 + %32 = bitcast i8* %vbase.offset.ptr.i.i15 to i64* + %vbase.offset.i.i16 = load i64, i64* %32, align 8 + %add.ptr.i.i17 = getelementptr inbounds i8, i8* %0, i64 %vbase.offset.i.i16 + %33 = bitcast i8* %add.ptr.i.i17 to i64* + store i64 %31, i64* %33, align 8, !tbaa !97 + %34 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 0 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %34, align 8, !tbaa !97 + %_M_p.i.i.i.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 0, i32 0 + %35 = load i8*, i8** %_M_p.i.i.i.i.i.i.i, align 8, !tbaa !113 + %cmp.i.i.i.i.i.i = icmp eq i8* %35, %.cast.i.i.i + br i1 %cmp.i.i.i.i.i.i, label %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit, label %if.then.i.i.i.i.i + +if.then.i.i.i.i.i: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit + call void @_ZdlPv(i8* %35) #2 + br label %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit + +_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit: ; preds = %if.then.i.i.i.i.i, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %34, align 8, !tbaa !97 + call void @_ZNSt6localeD1Ev(%"class.std::locale"* nonnull %_M_buf_locale.i.i.i) #2 + %36 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 0 + call void @_ZNSt8ios_baseD2Ev(%"class.std::ios_base"* %36) #2 + call void @llvm.lifetime.end(i64 376, i8* nonnull %0) #2 + br label %if.end + +if.end: ; preds = %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit, %entry + %call6 = call i32 @fclose(%struct._IO_FILE* %call) + ret void +} + +; Function Attrs: nounwind uwtable +define void @_Z11dumpPSNRStdf(float %psnr_std) local_unnamed_addr #3 { +entry: + %ss = alloca %"class.std::__cxx11::basic_ostringstream", align 16 + %print_str = alloca %"class.std::__cxx11::basic_string", align 8 + %call = tail call %struct._IO_FILE* @fopen(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str.28, i64 0, i64 0), i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.23, i64 0, i64 0)) + %cmp = icmp eq %struct._IO_FILE* %call, null + br i1 %cmp, label %if.end, label %if.then + +if.then: ; preds = %entry + %0 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i8* + call void @llvm.lifetime.start(i64 376, i8* nonnull %0) #2 + %1 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2 + %2 = getelementptr inbounds %"class.std::basic_ios", %"class.std::basic_ios"* %1, i64 0, i32 0 + call void @_ZNSt8ios_baseC2Ev(%"class.std::ios_base"* %2) #2 + %3 = getelementptr inbounds %"class.std::basic_ios", %"class.std::basic_ios"* %1, i64 0, i32 0, i32 0 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [4 x i8*] }, { [4 x i8*] }* @_ZTVSt9basic_iosIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %3, align 16, !tbaa !97 + %_M_tie.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 1 + store %"class.std::basic_ostream"* null, %"class.std::basic_ostream"** %_M_tie.i.i, align 8, !tbaa !99 + %_M_fill.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 2 + store i8 0, i8* %_M_fill.i.i, align 16, !tbaa !102 + %_M_fill_init.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 3 + store i8 0, i8* %_M_fill_init.i.i, align 1, !tbaa !103 + %_M_streambuf.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 4 + %4 = bitcast %"class.std::basic_streambuf"** %_M_streambuf.i.i to i8* + call void @llvm.memset.p0i8.i64(i8* %4, i8 0, i64 32, i32 8, i1 false) #2 + %5 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 1) to i64*), align 8 + %6 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i64* + store i64 %5, i64* %6, align 16, !tbaa !97 + %7 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 2) to i64*), align 8 + %8 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i8** + %vtable.cast.i.i = inttoptr i64 %5 to i8* + %vbase.offset.ptr.i.i = getelementptr i8, i8* %vtable.cast.i.i, i64 -24 + %9 = bitcast i8* %vbase.offset.ptr.i.i to i64* + %vbase.offset.i.i = load i64, i64* %9, align 8 + %add.ptr.i.i = getelementptr inbounds i8, i8* %0, i64 %vbase.offset.i.i + %10 = bitcast i8* %add.ptr.i.i to i64* + store i64 %7, i64* %10, align 8, !tbaa !97 + %vtable3.i.i = load i8*, i8** %8, align 16, !tbaa !97 + %vbase.offset.ptr4.i.i = getelementptr i8, i8* %vtable3.i.i, i64 -24 + %11 = bitcast i8* %vbase.offset.ptr4.i.i to i64* + %vbase.offset5.i.i = load i64, i64* %11, align 8 + %add.ptr6.i.i = getelementptr inbounds i8, i8* %0, i64 %vbase.offset5.i.i + %12 = bitcast i8* %add.ptr6.i.i to %"class.std::basic_ios"* + call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %12, %"class.std::basic_streambuf"* null) #2 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 1, i64 3) to i32 (...)**), i32 (...)*** %3, align 16, !tbaa !97 + %_M_stringbuf.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1 + %13 = getelementptr inbounds %"class.std::__cxx11::basic_stringbuf", %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i, i64 0, i32 0, i32 0 + %14 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to <2 x i32 (...)**>* + store <2 x i32 (...)**> <i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 3) to i32 (...)**), i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**)>, <2 x i32 (...)**>* %14, align 16, !tbaa !97 + %_M_in_beg.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 1 + %_M_buf_locale.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 7 + %15 = bitcast i8** %_M_in_beg.i.i.i to i8* + call void @llvm.memset.p0i8.i64(i8* %15, i8 0, i64 48, i32 8, i1 false) #2 + call void @_ZNSt6localeC1Ev(%"class.std::locale"* %_M_buf_locale.i.i.i) #2 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %13, align 8, !tbaa !97 + %_M_mode.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 1 + store i32 16, i32* %_M_mode.i.i, align 8, !tbaa !104 + %_M_string.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2 + %16 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 2 + %17 = bitcast %"class.std::__cxx11::basic_string"* %_M_string.i.i to %union.anon** + store %union.anon* %16, %union.anon** %17, align 8, !tbaa !109 + %_M_string_length.i.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 1 + store i64 0, i64* %_M_string_length.i.i.i.i.i, align 8, !tbaa !110 + %.cast.i.i.i = bitcast %union.anon* %16 to i8* + store i8 0, i8* %.cast.i.i.i, align 8, !tbaa !93 + %vtable.i = load i8*, i8** %8, align 16, !tbaa !97 + %vbase.offset.ptr.i = getelementptr i8, i8* %vtable.i, i64 -24 + %18 = bitcast i8* %vbase.offset.ptr.i to i64* + %vbase.offset.i = load i64, i64* %18, align 8 + %add.ptr2.i = getelementptr inbounds i8, i8* %0, i64 %vbase.offset.i + %19 = bitcast i8* %add.ptr2.i to %"class.std::basic_ios"* + %20 = getelementptr inbounds %"class.std::__cxx11::basic_stringbuf", %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i, i64 0, i32 0 + call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %19, %"class.std::basic_streambuf"* %20) #2 + %21 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to %"class.std::basic_ostream"* + %vtable.i10 = load i8*, i8** %8, align 16, !tbaa !97 + %vbase.offset.ptr.i11 = getelementptr i8, i8* %vtable.i10, i64 -24 + %22 = bitcast i8* %vbase.offset.ptr.i11 to i64* + %vbase.offset.i12 = load i64, i64* %22, align 8 + %add.ptr.i = getelementptr inbounds i8, i8* %0, i64 %vbase.offset.i12 + %_M_flags.i.i = getelementptr inbounds i8, i8* %add.ptr.i, i64 24 + %23 = bitcast i8* %_M_flags.i.i to i32* + %24 = load i32, i32* %23, align 8, !tbaa !116 + %and.i.i.i.i = and i32 %24, -261 + %or.i.i.i.i = or i32 %and.i.i.i.i, 4 + store i32 %or.i.i.i.i, i32* %23, align 4, !tbaa !111 + %conv.i = fpext float %psnr_std to double + %call.i = call dereferenceable(272) %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"* nonnull %21, double %conv.i) #2 + %25 = bitcast %"class.std::__cxx11::basic_string"* %print_str to i8* + call void @llvm.lifetime.start(i64 32, i8* nonnull %25) #2 + call void @_ZNKSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEE3strEv(%"class.std::__cxx11::basic_string"* nonnull sret %print_str, %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i) #2 + %_M_p.i.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 0, i32 0 + %26 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !113 + %_M_string_length.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 1 + %27 = load i64, i64* %_M_string_length.i, align 8, !tbaa !110 + %call5 = call i64 @fwrite(i8* %26, i64 1, i64 %27, %struct._IO_FILE* nonnull %call) + %28 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !113 + %29 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 2 + %arraydecay.i.i.i.i = bitcast %union.anon* %29 to i8* + %cmp.i.i.i = icmp eq i8* %28, %arraydecay.i.i.i.i + br i1 %cmp.i.i.i, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit, label %if.then.i.i + +if.then.i.i: ; preds = %if.then + call void @_ZdlPv(i8* %28) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit: ; preds = %if.then.i.i, %if.then + call void @llvm.lifetime.end(i64 32, i8* nonnull %25) #2 + %30 = load i64, i64* bitcast ([4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE to i64*), align 8 + store i64 %30, i64* %6, align 16, !tbaa !97 + %31 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 3) to i64*), align 8 + %vtable.cast.i.i14 = inttoptr i64 %30 to i8* + %vbase.offset.ptr.i.i15 = getelementptr i8, i8* %vtable.cast.i.i14, i64 -24 + %32 = bitcast i8* %vbase.offset.ptr.i.i15 to i64* + %vbase.offset.i.i16 = load i64, i64* %32, align 8 + %add.ptr.i.i17 = getelementptr inbounds i8, i8* %0, i64 %vbase.offset.i.i16 + %33 = bitcast i8* %add.ptr.i.i17 to i64* + store i64 %31, i64* %33, align 8, !tbaa !97 + %34 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 0 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %34, align 8, !tbaa !97 + %_M_p.i.i.i.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 0, i32 0 + %35 = load i8*, i8** %_M_p.i.i.i.i.i.i.i, align 8, !tbaa !113 + %cmp.i.i.i.i.i.i = icmp eq i8* %35, %.cast.i.i.i + br i1 %cmp.i.i.i.i.i.i, label %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit, label %if.then.i.i.i.i.i + +if.then.i.i.i.i.i: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit + call void @_ZdlPv(i8* %35) #2 + br label %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit + +_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit: ; preds = %if.then.i.i.i.i.i, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %34, align 8, !tbaa !97 + call void @_ZNSt6localeD1Ev(%"class.std::locale"* nonnull %_M_buf_locale.i.i.i) #2 + %36 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 0 + call void @_ZNSt8ios_baseD2Ev(%"class.std::ios_base"* %36) #2 + call void @llvm.lifetime.end(i64 376, i8* nonnull %0) #2 + br label %if.end + +if.end: ; preds = %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit, %entry + %call6 = call i32 @fclose(%struct._IO_FILE* %call) + ret void +} + +; Function Attrs: nounwind uwtable +define void @_Z23dumpExecutionAccuraciesv() local_unnamed_addr #3 { +entry: + %ss = alloca %"class.std::__cxx11::basic_ostringstream", align 16 + %print_str = alloca %"class.std::__cxx11::basic_string", align 8 + %call = tail call %struct._IO_FILE* @fopen(i8* getelementptr inbounds ([19 x i8], [19 x i8]* @.str.29, i64 0, i64 0), i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.23, i64 0, i64 0)) + %cmp = icmp eq %struct._IO_FILE* %call, null + br i1 %cmp, label %if.end, label %for.cond.preheader + +for.cond.preheader: ; preds = %entry + %0 = load i64, i64* bitcast (float** getelementptr inbounds (%"class.std::vector", %"class.std::vector"* @run_accuracies, i64 0, i32 0, i32 0, i32 1) to i64*), align 8, !tbaa !125 + %1 = load i64, i64* bitcast (%"class.std::vector"* @run_accuracies to i64*), align 8, !tbaa !52 + %cmp231 = icmp eq i64 %0, %1 + br i1 %cmp231, label %if.end, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %for.cond.preheader + %2 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i8* + %3 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2 + %4 = getelementptr inbounds %"class.std::basic_ios", %"class.std::basic_ios"* %3, i64 0, i32 0 + %5 = getelementptr inbounds %"class.std::basic_ios", %"class.std::basic_ios"* %3, i64 0, i32 0, i32 0 + %_M_tie.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 1 + %_M_fill.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 2 + %_M_fill_init.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 3 + %_M_streambuf.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 4 + %6 = bitcast %"class.std::basic_streambuf"** %_M_streambuf.i.i to i8* + %7 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 1) to i64*), align 8 + %8 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i64* + %9 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 2) to i64*), align 8 + %10 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i8** + %vtable.cast.i.i20 = inttoptr i64 %7 to i8* + %vbase.offset.ptr.i.i21 = getelementptr i8, i8* %vtable.cast.i.i20, i64 -24 + %11 = bitcast i8* %vbase.offset.ptr.i.i21 to i64* + %_M_stringbuf.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1 + %12 = getelementptr inbounds %"class.std::__cxx11::basic_stringbuf", %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i, i64 0, i32 0, i32 0 + %_M_in_beg.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 1 + %_M_buf_locale.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 7 + %13 = bitcast i8** %_M_in_beg.i.i.i to i8* + %_M_mode.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 1 + %_M_string.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2 + %14 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 2 + %15 = bitcast %"class.std::__cxx11::basic_string"* %_M_string.i.i to %union.anon** + %_M_string_length.i.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 1 + %.cast.i.i.i = bitcast %union.anon* %14 to i8* + %16 = getelementptr inbounds %"class.std::__cxx11::basic_stringbuf", %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i, i64 0, i32 0 + %17 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to %"class.std::basic_ostream"* + %18 = bitcast %"class.std::__cxx11::basic_string"* %print_str to i8* + %_M_p.i.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 0, i32 0 + %_M_string_length.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 1 + %19 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 2 + %arraydecay.i.i.i.i = bitcast %union.anon* %19 to i8* + %20 = load i64, i64* bitcast ([4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE to i64*), align 8 + %21 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 3) to i64*), align 8 + %vtable.cast.i.i = inttoptr i64 %20 to i8* + %vbase.offset.ptr.i.i = getelementptr i8, i8* %vtable.cast.i.i, i64 -24 + %22 = bitcast i8* %vbase.offset.ptr.i.i to i64* + %23 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 0 + %_M_p.i.i.i.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 0, i32 0 + %24 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 0 + %25 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to <2 x i32 (...)**>* + br label %for.body + +for.body: ; preds = %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit, %for.body.lr.ph + %.in = phi i64 [ %1, %for.body.lr.ph ], [ %42, %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit ] + %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit ] + %26 = inttoptr i64 %.in to float* + %add.ptr.i = getelementptr inbounds float, float* %26, i64 %indvars.iv + %27 = load float, float* %add.ptr.i, align 4, !tbaa !71 + call void @llvm.lifetime.start(i64 376, i8* nonnull %2) #2 + call void @_ZNSt8ios_baseC2Ev(%"class.std::ios_base"* %4) #2 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [4 x i8*] }, { [4 x i8*] }* @_ZTVSt9basic_iosIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %5, align 16, !tbaa !97 + store %"class.std::basic_ostream"* null, %"class.std::basic_ostream"** %_M_tie.i.i, align 8, !tbaa !99 + store i8 0, i8* %_M_fill.i.i, align 16, !tbaa !102 + store i8 0, i8* %_M_fill_init.i.i, align 1, !tbaa !103 + call void @llvm.memset.p0i8.i64(i8* %6, i8 0, i64 32, i32 8, i1 false) #2 + store i64 %7, i64* %8, align 16, !tbaa !97 + %vbase.offset.i.i22 = load i64, i64* %11, align 8 + %add.ptr.i.i23 = getelementptr inbounds i8, i8* %2, i64 %vbase.offset.i.i22 + %28 = bitcast i8* %add.ptr.i.i23 to i64* + store i64 %9, i64* %28, align 8, !tbaa !97 + %vtable3.i.i = load i8*, i8** %10, align 16, !tbaa !97 + %vbase.offset.ptr4.i.i = getelementptr i8, i8* %vtable3.i.i, i64 -24 + %29 = bitcast i8* %vbase.offset.ptr4.i.i to i64* + %vbase.offset5.i.i = load i64, i64* %29, align 8 + %add.ptr6.i.i = getelementptr inbounds i8, i8* %2, i64 %vbase.offset5.i.i + %30 = bitcast i8* %add.ptr6.i.i to %"class.std::basic_ios"* + call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %30, %"class.std::basic_streambuf"* null) #2 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 1, i64 3) to i32 (...)**), i32 (...)*** %5, align 16, !tbaa !97 + store <2 x i32 (...)**> <i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 3) to i32 (...)**), i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**)>, <2 x i32 (...)**>* %25, align 16, !tbaa !97 + call void @llvm.memset.p0i8.i64(i8* %13, i8 0, i64 48, i32 8, i1 false) #2 + call void @_ZNSt6localeC1Ev(%"class.std::locale"* %_M_buf_locale.i.i.i) #2 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %12, align 8, !tbaa !97 + store i32 16, i32* %_M_mode.i.i, align 8, !tbaa !104 + store %union.anon* %14, %union.anon** %15, align 8, !tbaa !109 + store i64 0, i64* %_M_string_length.i.i.i.i.i, align 8, !tbaa !110 + store i8 0, i8* %.cast.i.i.i, align 8, !tbaa !93 + %vtable.i = load i8*, i8** %10, align 16, !tbaa !97 + %vbase.offset.ptr.i = getelementptr i8, i8* %vtable.i, i64 -24 + %31 = bitcast i8* %vbase.offset.ptr.i to i64* + %vbase.offset.i = load i64, i64* %31, align 8 + %add.ptr2.i = getelementptr inbounds i8, i8* %2, i64 %vbase.offset.i + %32 = bitcast i8* %add.ptr2.i to %"class.std::basic_ios"* + call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %32, %"class.std::basic_streambuf"* %16) #2 + %vtable.i24 = load i8*, i8** %10, align 16, !tbaa !97 + %vbase.offset.ptr.i25 = getelementptr i8, i8* %vtable.i24, i64 -24 + %33 = bitcast i8* %vbase.offset.ptr.i25 to i64* + %vbase.offset.i26 = load i64, i64* %33, align 8 + %add.ptr.i27 = getelementptr inbounds i8, i8* %2, i64 %vbase.offset.i26 + %_M_flags.i.i = getelementptr inbounds i8, i8* %add.ptr.i27, i64 24 + %34 = bitcast i8* %_M_flags.i.i to i32* + %35 = load i32, i32* %34, align 8, !tbaa !116 + %and.i.i.i.i = and i32 %35, -261 + %or.i.i.i.i = or i32 %and.i.i.i.i, 4 + store i32 %or.i.i.i.i, i32* %34, align 4, !tbaa !111 + %conv.i = fpext float %27 to double + %call.i = call dereferenceable(272) %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"* nonnull %17, double %conv.i) #2 + call void @llvm.lifetime.start(i64 32, i8* nonnull %18) #2 + call void @_ZNKSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEE3strEv(%"class.std::__cxx11::basic_string"* nonnull sret %print_str, %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i) #2 + %36 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !113 + %37 = load i64, i64* %_M_string_length.i, align 8, !tbaa !110 + %call9 = call i64 @fwrite(i8* %36, i64 1, i64 %37, %struct._IO_FILE* nonnull %call) + %fputc = call i32 @fputc(i32 10, %struct._IO_FILE* nonnull %call) + %38 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !113 + %cmp.i.i.i = icmp eq i8* %38, %arraydecay.i.i.i.i + br i1 %cmp.i.i.i, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit, label %if.then.i.i + +if.then.i.i: ; preds = %for.body + call void @_ZdlPv(i8* %38) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit: ; preds = %if.then.i.i, %for.body + call void @llvm.lifetime.end(i64 32, i8* nonnull %18) #2 + store i64 %20, i64* %8, align 16, !tbaa !97 + %vbase.offset.i.i = load i64, i64* %22, align 8 + %add.ptr.i.i = getelementptr inbounds i8, i8* %2, i64 %vbase.offset.i.i + %39 = bitcast i8* %add.ptr.i.i to i64* + store i64 %21, i64* %39, align 8, !tbaa !97 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %23, align 8, !tbaa !97 + %40 = load i8*, i8** %_M_p.i.i.i.i.i.i.i, align 8, !tbaa !113 + %cmp.i.i.i.i.i.i = icmp eq i8* %40, %.cast.i.i.i + br i1 %cmp.i.i.i.i.i.i, label %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit, label %if.then.i.i.i.i.i + +if.then.i.i.i.i.i: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit + call void @_ZdlPv(i8* %40) #2 + br label %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit + +_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit: ; preds = %if.then.i.i.i.i.i, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %23, align 8, !tbaa !97 + call void @_ZNSt6localeD1Ev(%"class.std::locale"* nonnull %_M_buf_locale.i.i.i) #2 + call void @_ZNSt8ios_baseD2Ev(%"class.std::ios_base"* nonnull %24) #2 + call void @llvm.lifetime.end(i64 376, i8* nonnull %2) #2 + %indvars.iv.next = add nuw i64 %indvars.iv, 1 + %41 = load i64, i64* bitcast (float** getelementptr inbounds (%"class.std::vector", %"class.std::vector"* @run_accuracies, i64 0, i32 0, i32 0, i32 1) to i64*), align 8, !tbaa !125 + %42 = load i64, i64* bitcast (%"class.std::vector"* @run_accuracies to i64*), align 8, !tbaa !52 + %sub.ptr.sub.i = sub i64 %41, %42 + %sub.ptr.div.i = ashr exact i64 %sub.ptr.sub.i, 2 + %cmp2 = icmp ult i64 %indvars.iv.next, %sub.ptr.div.i + br i1 %cmp2, label %for.body, label %if.end.loopexit + +if.end.loopexit: ; preds = %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit + br label %if.end + +if.end: ; preds = %if.end.loopexit, %for.cond.preheader, %entry + %call11 = call i32 @fclose(%struct._IO_FILE* %call) + ret void +} + +; Function Attrs: nounwind uwtable +define float @_Z16readPSNRFromFilePKc(i8* nocapture readonly %file_name) local_unnamed_addr #3 { +entry: + %psnr = alloca float, align 4 + %0 = bitcast float* %psnr to i8* + call void @llvm.lifetime.start(i64 4, i8* nonnull %0) #2 + %call = tail call %struct._IO_FILE* @fopen(i8* %file_name, i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.30, i64 0, i64 0)) + %cmp = icmp eq %struct._IO_FILE* %call, null + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %puts = tail call i32 @puts(i8* getelementptr inbounds ([28 x i8], [28 x i8]* @str.79, i64 0, i64 0)) + tail call void @abort() #13 + unreachable + +if.end: ; preds = %entry + %call2 = call i32 (%struct._IO_FILE*, i8*, ...) @fscanf(%struct._IO_FILE* nonnull %call, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.32, i64 0, i64 0), float* nonnull %psnr) + %1 = load float, float* %psnr, align 4, !tbaa !71 + %conv = fpext float %1 to double + %call3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str.33, i64 0, i64 0), double %conv) + %2 = load float, float* %psnr, align 4, !tbaa !71 + call void @llvm.lifetime.end(i64 4, i8* nonnull %0) #2 + ret float %2 +} + +; Function Attrs: nounwind +declare i32 @fscanf(%struct._IO_FILE* nocapture, i8* nocapture readonly, ...) local_unnamed_addr #1 + +; Function Attrs: nounwind uwtable +define float @_Z20computePSNRViolationPvS_f(i8* nocapture readonly %gold_ptr, i8* nocapture readonly %approx_ptr, float %PSNR_threshold) local_unnamed_addr #3 { +entry: + %psnr.i = alloca float, align 4 + %psnr_list.sroa.9 = alloca i64, align 8 + %psnr_list.sroa.13 = alloca float*, align 8 + %ss = alloca %"class.std::__cxx11::basic_ostringstream", align 16 + %print_str = alloca %"class.std::__cxx11::basic_string", align 8 + %0 = bitcast float* %psnr.i to i8* + call void @llvm.lifetime.start(i64 4, i8* nonnull %0) #2 + %call.i = tail call %struct._IO_FILE* @fopen(i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str.34, i64 0, i64 0), i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.30, i64 0, i64 0)) #2 + %cmp.i = icmp eq %struct._IO_FILE* %call.i, null + br i1 %cmp.i, label %if.then.i, label %_Z16readPSNRFromFilePKc.exit + +if.then.i: ; preds = %entry + %puts.i = tail call i32 @puts(i8* getelementptr inbounds ([28 x i8], [28 x i8]* @str.79, i64 0, i64 0)) #2 + tail call void @abort() #13 + unreachable + +_Z16readPSNRFromFilePKc.exit: ; preds = %entry + %call2.i = call i32 (%struct._IO_FILE*, i8*, ...) @fscanf(%struct._IO_FILE* nonnull %call.i, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.32, i64 0, i64 0), float* nonnull %psnr.i) #2 + %1 = load float, float* %psnr.i, align 4, !tbaa !71 + %conv.i = fpext float %1 to double + %call3.i = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str.33, i64 0, i64 0), double %conv.i) #2 + %2 = load float, float* %psnr.i, align 4, !tbaa !71 + call void @llvm.lifetime.end(i64 4, i8* nonnull %0) #2 + %psnr_list.sroa.9.0..sroa_cast174 = bitcast i64* %psnr_list.sroa.9 to i8* + call void @llvm.lifetime.start(i64 8, i8* nonnull %psnr_list.sroa.9.0..sroa_cast174) + %psnr_list.sroa.13.0..sroa_cast172 = bitcast float** %psnr_list.sroa.13 to i8* + call void @llvm.lifetime.start(i64 8, i8* nonnull %psnr_list.sroa.13.0..sroa_cast172) + store i64 0, i64* %psnr_list.sroa.9, align 8 + store float* null, float** %psnr_list.sroa.13, align 8 + %dim_sizes1 = getelementptr inbounds i8, i8* %gold_ptr, i64 96 + %3 = bitcast i8* %dim_sizes1 to i64** + %4 = load i64*, i64** %3, align 8, !tbaa !65 + %5 = load i64, i64* %4, align 8, !tbaa !66 + %arrayidx2 = getelementptr inbounds i64, i64* %4, i64 1 + %6 = load i64, i64* %arrayidx2, align 8, !tbaa !66 + %arrayidx3 = getelementptr inbounds i64, i64* %4, i64 2 + %7 = load i64, i64* %arrayidx3, align 8, !tbaa !66 + %mul = mul i64 %7, %6 + %arrayidx4 = getelementptr inbounds i64, i64* %4, i64 3 + %8 = load i64, i64* %arrayidx4, align 8, !tbaa !66 + %mul5 = mul i64 %mul, %8 + %call6 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.str.35, i64 0, i64 0), i64 %5, i64 %mul5) + %host_data = getelementptr inbounds i8, i8* %gold_ptr, i64 48 + %9 = bitcast i8* %host_data to float** + %10 = load float*, float** %9, align 8, !tbaa !68 + %host_data7 = getelementptr inbounds i8, i8* %approx_ptr, i64 48 + %11 = bitcast i8* %host_data7 to float** + %12 = load float*, float** %11, align 8, !tbaa !68 + %call8 = call %struct._IO_FILE* @fopen(i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str.36, i64 0, i64 0), i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.23, i64 0, i64 0)) + %cmp187 = icmp eq i64 %5, 0 + br i1 %cmp187, label %for.cond.cleanup, label %for.body.lr.ph + +for.body.lr.ph: ; preds = %_Z16readPSNRFromFilePKc.exit + %cmp11182 = icmp eq i64 %mul5, 0 + %conv = uitofp i64 %mul5 to float + %13 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i8* + %14 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2 + %15 = getelementptr inbounds %"class.std::basic_ios", %"class.std::basic_ios"* %14, i64 0, i32 0 + %16 = getelementptr inbounds %"class.std::basic_ios", %"class.std::basic_ios"* %14, i64 0, i32 0, i32 0 + %_M_tie.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 1 + %_M_fill.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 2 + %_M_fill_init.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 3 + %_M_streambuf.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 4 + %17 = bitcast %"class.std::basic_streambuf"** %_M_streambuf.i.i to i8* + %18 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 1) to i64*), align 8 + %19 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i64* + %20 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 2) to i64*), align 8 + %21 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to i8** + %vtable.cast.i.i = inttoptr i64 %18 to i8* + %vbase.offset.ptr.i.i = getelementptr i8, i8* %vtable.cast.i.i, i64 -24 + %22 = bitcast i8* %vbase.offset.ptr.i.i to i64* + %_M_stringbuf.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1 + %23 = getelementptr inbounds %"class.std::__cxx11::basic_stringbuf", %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i, i64 0, i32 0, i32 0 + %_M_in_beg.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 1 + %_M_buf_locale.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 7 + %24 = bitcast i8** %_M_in_beg.i.i.i to i8* + %_M_mode.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 1 + %_M_string.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2 + %25 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 2 + %26 = bitcast %"class.std::__cxx11::basic_string"* %_M_string.i.i to %union.anon** + %_M_string_length.i.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 1 + %.cast.i.i.i = bitcast %union.anon* %25 to i8* + %27 = getelementptr inbounds %"class.std::__cxx11::basic_stringbuf", %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i, i64 0, i32 0 + %28 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to %"class.std::basic_ostream"* + %29 = bitcast %"class.std::__cxx11::basic_string"* %print_str to i8* + %_M_p.i.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 0, i32 0 + %_M_string_length.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 1 + %30 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %print_str, i64 0, i32 2 + %arraydecay.i.i.i.i = bitcast %union.anon* %30 to i8* + %31 = load i64, i64* bitcast ([4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE to i64*), align 8 + %32 = load i64, i64* bitcast (i8** getelementptr inbounds ([4 x i8*], [4 x i8*]* @_ZTTNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, i64 3) to i64*), align 8 + %vtable.cast.i.i153 = inttoptr i64 %31 to i8* + %vbase.offset.ptr.i.i154 = getelementptr i8, i8* %vtable.cast.i.i153, i64 -24 + %33 = bitcast i8* %vbase.offset.ptr.i.i154 to i64* + %34 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 0, i32 0 + %_M_p.i.i.i.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 1, i32 2, i32 0, i32 0 + %35 = getelementptr inbounds %"class.std::__cxx11::basic_ostringstream", %"class.std::__cxx11::basic_ostringstream"* %ss, i64 0, i32 2, i32 0 + %psnr_list.sroa.9.0._M_finish.i.sroa_cast = bitcast i64* %psnr_list.sroa.9 to float** + %36 = add i64 %mul5, -8 + %37 = lshr i64 %36, 3 + %38 = bitcast %"class.std::__cxx11::basic_ostringstream"* %ss to <2 x i32 (...)**>* + %min.iters.check = icmp ult i64 %mul5, 8 + %n.vec = and i64 %mul5, -8 + %cmp.zero = icmp eq i64 %n.vec, 0 + %39 = and i64 %37, 1 + %lcmp.mod246 = icmp eq i64 %39, 0 + %40 = icmp eq i64 %37, 0 + %cmp.n = icmp eq i64 %mul5, %n.vec + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit + %phitmp = sitofp i32 %num_errors.1 to double + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %_Z16readPSNRFromFilePKc.exit + %psnr_list.sroa.0.0.lcssa = phi i64 [ 0, %_Z16readPSNRFromFilePKc.exit ], [ %psnr_list.sroa.0.1, %for.cond.cleanup.loopexit ] + %num_errors.0.lcssa = phi double [ 0.000000e+00, %_Z16readPSNRFromFilePKc.exit ], [ %phitmp, %for.cond.cleanup.loopexit ] + %sum_psnr.0.lcssa = phi float [ 0.000000e+00, %_Z16readPSNRFromFilePKc.exit ], [ %add28, %for.cond.cleanup.loopexit ] + %conv46 = uitofp i64 %5 to double + %div47 = fdiv fast double %num_errors.0.lcssa, %conv46 + %mul48 = fmul fast double %div47, 1.000000e+02 + %conv49 = fptrunc double %mul48 to float + %conv50 = fpext float %conv49 to double + %call51 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([26 x i8], [26 x i8]* @.str.38, i64 0, i64 0), double %conv50) + %conv52 = uitofp i64 %5 to float + %div53 = fdiv fast float %sum_psnr.0.lcssa, %conv52 + %conv54 = fpext float %div53 to double + %call55 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.39, i64 0, i64 0), double %conv54) + call void @_Z11dumpAvgPSNRf(float %div53) + %conv58 = fsub fast float 1.000000e+02, %conv49 + call void @_Z17dumpFinalAccuracyf(float %conv58) + %call59 = call i32 @fclose(%struct._IO_FILE* %call8) + br i1 %cmp187, label %for.cond.cleanup63, label %for.body64.lr.ph + +for.body64.lr.ph: ; preds = %for.cond.cleanup + %41 = inttoptr i64 %psnr_list.sroa.0.0.lcssa to float* + %min.iters.check213 = icmp ult i64 %5, 8 + br i1 %min.iters.check213, label %for.body64.preheader, label %min.iters.checked214 + +for.body64.preheader: ; preds = %middle.block211, %min.iters.checked214, %for.body64.lr.ph + %i60.0181.ph = phi i64 [ 0, %min.iters.checked214 ], [ 0, %for.body64.lr.ph ], [ %n.vec216, %middle.block211 ] + %var.0180.ph = phi float [ 0.000000e+00, %min.iters.checked214 ], [ 0.000000e+00, %for.body64.lr.ph ], [ %74, %middle.block211 ] + br label %for.body64 + +min.iters.checked214: ; preds = %for.body64.lr.ph + %n.vec216 = and i64 %5, -8 + %cmp.zero217 = icmp eq i64 %n.vec216, 0 + br i1 %cmp.zero217, label %for.body64.preheader, label %vector.ph218 + +vector.ph218: ; preds = %min.iters.checked214 + %broadcast.splatinsert231 = insertelement <4 x float> undef, float %div53, i32 0 + %broadcast.splat232 = shufflevector <4 x float> %broadcast.splatinsert231, <4 x float> undef, <4 x i32> zeroinitializer + %42 = add i64 %n.vec216, -8 + %43 = lshr exact i64 %42, 3 + %44 = and i64 %43, 1 + %lcmp.mod = icmp eq i64 %44, 0 + br i1 %lcmp.mod, label %vector.body210.prol.preheader, label %vector.body210.prol.loopexit + +vector.body210.prol.preheader: ; preds = %vector.ph218 + br label %vector.body210.prol + +vector.body210.prol: ; preds = %vector.body210.prol.preheader + %45 = inttoptr i64 %psnr_list.sroa.0.0.lcssa to <4 x float>* + %wide.load229.prol = load <4 x float>, <4 x float>* %45, align 4, !tbaa !71 + %46 = getelementptr float, float* %41, i64 4 + %47 = bitcast float* %46 to <4 x float>* + %wide.load230.prol = load <4 x float>, <4 x float>* %47, align 4, !tbaa !71 + %48 = fsub fast <4 x float> %wide.load229.prol, %broadcast.splat232 + %49 = fsub fast <4 x float> %wide.load230.prol, %broadcast.splat232 + %50 = fmul fast <4 x float> %48, %48 + %51 = fmul fast <4 x float> %49, %49 + br label %vector.body210.prol.loopexit + +vector.body210.prol.loopexit: ; preds = %vector.body210.prol, %vector.ph218 + %.lcssa240.unr = phi <4 x float> [ undef, %vector.ph218 ], [ %50, %vector.body210.prol ] + %.lcssa.unr = phi <4 x float> [ undef, %vector.ph218 ], [ %51, %vector.body210.prol ] + %index219.unr = phi i64 [ 0, %vector.ph218 ], [ 8, %vector.body210.prol ] + %vec.phi227.unr = phi <4 x float> [ zeroinitializer, %vector.ph218 ], [ %50, %vector.body210.prol ] + %vec.phi228.unr = phi <4 x float> [ zeroinitializer, %vector.ph218 ], [ %51, %vector.body210.prol ] + %52 = icmp eq i64 %43, 0 + br i1 %52, label %middle.block211, label %vector.ph218.new + +vector.ph218.new: ; preds = %vector.body210.prol.loopexit + br label %vector.body210 + +vector.body210: ; preds = %vector.body210, %vector.ph218.new + %index219 = phi i64 [ %index219.unr, %vector.ph218.new ], [ %index.next220.1, %vector.body210 ] + %vec.phi227 = phi <4 x float> [ %vec.phi227.unr, %vector.ph218.new ], [ %71, %vector.body210 ] + %vec.phi228 = phi <4 x float> [ %vec.phi228.unr, %vector.ph218.new ], [ %72, %vector.body210 ] + %53 = getelementptr inbounds float, float* %41, i64 %index219 + %54 = bitcast float* %53 to <4 x float>* + %wide.load229 = load <4 x float>, <4 x float>* %54, align 4, !tbaa !71 + %55 = getelementptr float, float* %53, i64 4 + %56 = bitcast float* %55 to <4 x float>* + %wide.load230 = load <4 x float>, <4 x float>* %56, align 4, !tbaa !71 + %57 = fsub fast <4 x float> %wide.load229, %broadcast.splat232 + %58 = fsub fast <4 x float> %wide.load230, %broadcast.splat232 + %59 = fmul fast <4 x float> %57, %57 + %60 = fmul fast <4 x float> %58, %58 + %61 = fadd fast <4 x float> %59, %vec.phi227 + %62 = fadd fast <4 x float> %60, %vec.phi228 + %index.next220 = add i64 %index219, 8 + %63 = getelementptr inbounds float, float* %41, i64 %index.next220 + %64 = bitcast float* %63 to <4 x float>* + %wide.load229.1 = load <4 x float>, <4 x float>* %64, align 4, !tbaa !71 + %65 = getelementptr float, float* %63, i64 4 + %66 = bitcast float* %65 to <4 x float>* + %wide.load230.1 = load <4 x float>, <4 x float>* %66, align 4, !tbaa !71 + %67 = fsub fast <4 x float> %wide.load229.1, %broadcast.splat232 + %68 = fsub fast <4 x float> %wide.load230.1, %broadcast.splat232 + %69 = fmul fast <4 x float> %67, %67 + %70 = fmul fast <4 x float> %68, %68 + %71 = fadd fast <4 x float> %69, %61 + %72 = fadd fast <4 x float> %70, %62 + %index.next220.1 = add i64 %index219, 16 + %73 = icmp eq i64 %index.next220.1, %n.vec216 + br i1 %73, label %middle.block211.unr-lcssa, label %vector.body210, !llvm.loop !127 + +middle.block211.unr-lcssa: ; preds = %vector.body210 + br label %middle.block211 + +middle.block211: ; preds = %middle.block211.unr-lcssa, %vector.body210.prol.loopexit + %.lcssa240 = phi <4 x float> [ %.lcssa240.unr, %vector.body210.prol.loopexit ], [ %71, %middle.block211.unr-lcssa ] + %.lcssa = phi <4 x float> [ %.lcssa.unr, %vector.body210.prol.loopexit ], [ %72, %middle.block211.unr-lcssa ] + %bin.rdx233 = fadd fast <4 x float> %.lcssa, %.lcssa240 + %rdx.shuf234 = shufflevector <4 x float> %bin.rdx233, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> + %bin.rdx235 = fadd fast <4 x float> %bin.rdx233, %rdx.shuf234 + %rdx.shuf236 = shufflevector <4 x float> %bin.rdx235, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> + %bin.rdx237 = fadd fast <4 x float> %bin.rdx235, %rdx.shuf236 + %74 = extractelement <4 x float> %bin.rdx237, i32 0 + %cmp.n222 = icmp eq i64 %5, %n.vec216 + br i1 %cmp.n222, label %for.cond.cleanup63, label %for.body64.preheader + +for.body: ; preds = %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit, %for.body.lr.ph + %sum_psnr.0191 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add28, %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit ] + %num_errors.0190 = phi i32 [ 0, %for.body.lr.ph ], [ %num_errors.1, %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit ] + %i.0189 = phi i64 [ 0, %for.body.lr.ph ], [ %inc42, %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit ] + %psnr_list.sroa.0.0188 = phi i64 [ 0, %for.body.lr.ph ], [ %psnr_list.sroa.0.1, %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit ] + %mul9 = mul i64 %i.0189, %mul5 + br i1 %cmp11182, label %for.cond.cleanup12, label %for.body13.preheader + +for.body13.preheader: ; preds = %for.body + br i1 %min.iters.check, label %for.body13.preheader239, label %min.iters.checked + +for.body13.preheader239: ; preds = %middle.block, %min.iters.checked, %for.body13.preheader + %mse_sum.0185.ph = phi float [ 0.000000e+00, %min.iters.checked ], [ 0.000000e+00, %for.body13.preheader ], [ %118, %middle.block ] + %j.0184.ph = phi i64 [ 0, %min.iters.checked ], [ 0, %for.body13.preheader ], [ %n.vec, %middle.block ] + br label %for.body13 + +min.iters.checked: ; preds = %for.body13.preheader + br i1 %cmp.zero, label %for.body13.preheader239, label %vector.body.preheader + +vector.body.preheader: ; preds = %min.iters.checked + br i1 %lcmp.mod246, label %vector.body.prol.preheader, label %vector.body.prol.loopexit.unr-lcssa + +vector.body.prol.preheader: ; preds = %vector.body.preheader + br label %vector.body.prol + +vector.body.prol: ; preds = %vector.body.prol.preheader + %75 = getelementptr inbounds float, float* %10, i64 %mul9 + %76 = bitcast float* %75 to <4 x float>* + %wide.load.prol = load <4 x float>, <4 x float>* %76, align 4, !tbaa !71 + %77 = getelementptr float, float* %75, i64 4 + %78 = bitcast float* %77 to <4 x float>* + %wide.load204.prol = load <4 x float>, <4 x float>* %78, align 4, !tbaa !71 + %79 = getelementptr inbounds float, float* %12, i64 %mul9 + %80 = bitcast float* %79 to <4 x float>* + %wide.load205.prol = load <4 x float>, <4 x float>* %80, align 4, !tbaa !71 + %81 = getelementptr float, float* %79, i64 4 + %82 = bitcast float* %81 to <4 x float>* + %wide.load206.prol = load <4 x float>, <4 x float>* %82, align 4, !tbaa !71 + %83 = fsub fast <4 x float> %wide.load.prol, %wide.load205.prol + %84 = fsub fast <4 x float> %wide.load204.prol, %wide.load206.prol + %85 = fmul fast <4 x float> %83, %83 + %86 = fmul fast <4 x float> %84, %84 + br label %vector.body.prol.loopexit.unr-lcssa + +vector.body.prol.loopexit.unr-lcssa: ; preds = %vector.body.prol, %vector.body.preheader + %.lcssa242.unr.ph = phi <4 x float> [ %85, %vector.body.prol ], [ undef, %vector.body.preheader ] + %.lcssa241.unr.ph = phi <4 x float> [ %86, %vector.body.prol ], [ undef, %vector.body.preheader ] + %index.unr.ph = phi i64 [ 8, %vector.body.prol ], [ 0, %vector.body.preheader ] + %vec.phi.unr.ph = phi <4 x float> [ %85, %vector.body.prol ], [ zeroinitializer, %vector.body.preheader ] + %vec.phi202.unr.ph = phi <4 x float> [ %86, %vector.body.prol ], [ zeroinitializer, %vector.body.preheader ] + br label %vector.body.prol.loopexit + +vector.body.prol.loopexit: ; preds = %vector.body.prol.loopexit.unr-lcssa + br i1 %40, label %middle.block, label %vector.body.preheader.new + +vector.body.preheader.new: ; preds = %vector.body.prol.loopexit + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.body.preheader.new + %index = phi i64 [ %index.unr.ph, %vector.body.preheader.new ], [ %index.next.1, %vector.body ] + %vec.phi = phi <4 x float> [ %vec.phi.unr.ph, %vector.body.preheader.new ], [ %115, %vector.body ] + %vec.phi202 = phi <4 x float> [ %vec.phi202.unr.ph, %vector.body.preheader.new ], [ %116, %vector.body ] + %87 = add i64 %index, %mul9 + %88 = getelementptr inbounds float, float* %10, i64 %87 + %89 = bitcast float* %88 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %89, align 4, !tbaa !71 + %90 = getelementptr float, float* %88, i64 4 + %91 = bitcast float* %90 to <4 x float>* + %wide.load204 = load <4 x float>, <4 x float>* %91, align 4, !tbaa !71 + %92 = getelementptr inbounds float, float* %12, i64 %87 + %93 = bitcast float* %92 to <4 x float>* + %wide.load205 = load <4 x float>, <4 x float>* %93, align 4, !tbaa !71 + %94 = getelementptr float, float* %92, i64 4 + %95 = bitcast float* %94 to <4 x float>* + %wide.load206 = load <4 x float>, <4 x float>* %95, align 4, !tbaa !71 + %96 = fsub fast <4 x float> %wide.load, %wide.load205 + %97 = fsub fast <4 x float> %wide.load204, %wide.load206 + %98 = fmul fast <4 x float> %96, %96 + %99 = fmul fast <4 x float> %97, %97 + %100 = fadd fast <4 x float> %98, %vec.phi + %101 = fadd fast <4 x float> %99, %vec.phi202 + %index.next = add i64 %index, 8 + %102 = add i64 %index.next, %mul9 + %103 = getelementptr inbounds float, float* %10, i64 %102 + %104 = bitcast float* %103 to <4 x float>* + %wide.load.1 = load <4 x float>, <4 x float>* %104, align 4, !tbaa !71 + %105 = getelementptr float, float* %103, i64 4 + %106 = bitcast float* %105 to <4 x float>* + %wide.load204.1 = load <4 x float>, <4 x float>* %106, align 4, !tbaa !71 + %107 = getelementptr inbounds float, float* %12, i64 %102 + %108 = bitcast float* %107 to <4 x float>* + %wide.load205.1 = load <4 x float>, <4 x float>* %108, align 4, !tbaa !71 + %109 = getelementptr float, float* %107, i64 4 + %110 = bitcast float* %109 to <4 x float>* + %wide.load206.1 = load <4 x float>, <4 x float>* %110, align 4, !tbaa !71 + %111 = fsub fast <4 x float> %wide.load.1, %wide.load205.1 + %112 = fsub fast <4 x float> %wide.load204.1, %wide.load206.1 + %113 = fmul fast <4 x float> %111, %111 + %114 = fmul fast <4 x float> %112, %112 + %115 = fadd fast <4 x float> %113, %100 + %116 = fadd fast <4 x float> %114, %101 + %index.next.1 = add i64 %index, 16 + %117 = icmp eq i64 %index.next.1, %n.vec + br i1 %117, label %middle.block.unr-lcssa, label %vector.body, !llvm.loop !128 + +middle.block.unr-lcssa: ; preds = %vector.body + br label %middle.block + +middle.block: ; preds = %middle.block.unr-lcssa, %vector.body.prol.loopexit + %.lcssa242 = phi <4 x float> [ %.lcssa242.unr.ph, %vector.body.prol.loopexit ], [ %115, %middle.block.unr-lcssa ] + %.lcssa241 = phi <4 x float> [ %.lcssa241.unr.ph, %vector.body.prol.loopexit ], [ %116, %middle.block.unr-lcssa ] + %bin.rdx = fadd fast <4 x float> %.lcssa241, %.lcssa242 + %rdx.shuf = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> + %bin.rdx207 = fadd fast <4 x float> %bin.rdx, %rdx.shuf + %rdx.shuf208 = shufflevector <4 x float> %bin.rdx207, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> + %bin.rdx209 = fadd fast <4 x float> %bin.rdx207, %rdx.shuf208 + %118 = extractelement <4 x float> %bin.rdx209, i32 0 + br i1 %cmp.n, label %for.cond.cleanup12, label %for.body13.preheader239 + +for.cond.cleanup12.loopexit: ; preds = %for.body13 + br label %for.cond.cleanup12 + +for.cond.cleanup12: ; preds = %for.cond.cleanup12.loopexit, %middle.block, %for.body + %mse_sum.0.lcssa = phi float [ 0.000000e+00, %for.body ], [ %118, %middle.block ], [ %add18, %for.cond.cleanup12.loopexit ] + %div = fdiv fast float %mse_sum.0.lcssa, %conv + %call.i141 = call fast float @sqrtf(float %div) #12 + %div25 = fdiv fast float 2.550000e+02, %call.i141 + %call.i142 = call fast float @log10f(float %div25) #12 + %mul27 = fmul fast float %call.i142, 2.000000e+01 + %add28 = fadd fast float %mul27, %sum_psnr.0191 + %cmp29 = fcmp fast olt float %mul27, %2 + %add31 = zext i1 %cmp29 to i32 + %num_errors.1 = add nsw i32 %add31, %num_errors.0190 + %conv33 = fpext float %mul27 to double + %call34 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str.37, i64 0, i64 0), double %conv33) + %psnr_list.sroa.9.0.psnr_list.sroa.9.8. = load i64, i64* %psnr_list.sroa.9, align 8 + %119 = inttoptr i64 %psnr_list.sroa.9.0.psnr_list.sroa.9.8. to float* + %psnr_list.sroa.13.0.psnr_list.sroa.13.16. = load float*, float** %psnr_list.sroa.13, align 8 + %cmp.i143 = icmp eq float* %119, %psnr_list.sroa.13.0.psnr_list.sroa.13.16. + br i1 %cmp.i143, label %if.else.i, label %if.then.i144 + +if.then.i144: ; preds = %for.cond.cleanup12 + store float %mul27, float* %119, align 4, !tbaa !71 + %incdec.ptr.i = getelementptr inbounds float, float* %119, i64 1 + br label %_ZNSt6vectorIfSaIfEE9push_backERKf.exit + +if.else.i: ; preds = %for.cond.cleanup12 + %sub.ptr.sub.i21.i.i.i = sub i64 %psnr_list.sroa.9.0.psnr_list.sroa.9.8., %psnr_list.sroa.0.0188 + %sub.ptr.div.i22.i.i.i = ashr exact i64 %sub.ptr.sub.i21.i.i.i, 2 + %cmp.i.i.i.i = icmp eq i64 %sub.ptr.div.i22.i.i.i, 0 + %.sroa.speculated.i.i.i = select i1 %cmp.i.i.i.i, i64 1, i64 %sub.ptr.div.i22.i.i.i + %add.i.i.i = add nsw i64 %.sroa.speculated.i.i.i, %sub.ptr.div.i22.i.i.i + %cmp7.i.i.i = icmp ult i64 %add.i.i.i, %sub.ptr.div.i22.i.i.i + %cmp9.i.i.i = icmp ugt i64 %add.i.i.i, 4611686018427387903 + %or.cond.i.i.i = or i1 %cmp7.i.i.i, %cmp9.i.i.i + %cond.i.i.i = select i1 %or.cond.i.i.i, i64 4611686018427387903, i64 %add.i.i.i + %cmp.i35.i.i = icmp eq i64 %cond.i.i.i, 0 + br i1 %cmp.i35.i.i, label %_ZNSt12_Vector_baseIfSaIfEE11_M_allocateEm.exit.i.i, label %cond.true.i.i.i + +cond.true.i.i.i: ; preds = %if.else.i + %cmp.i.i.i.i.i = icmp ugt i64 %cond.i.i.i, 4611686018427387903 + br i1 %cmp.i.i.i.i.i, label %if.then.i.i.i.i.i, label %_ZNSt16allocator_traitsISaIfEE8allocateERS0_m.exit.i.i.i + +if.then.i.i.i.i.i: ; preds = %cond.true.i.i.i + call void @_ZSt17__throw_bad_allocv() #13 + unreachable + +_ZNSt16allocator_traitsISaIfEE8allocateERS0_m.exit.i.i.i: ; preds = %cond.true.i.i.i + %mul.i.i.i.i.i = shl i64 %cond.i.i.i, 2 + %call2.i.i.i.i.i = call i8* @_Znwm(i64 %mul.i.i.i.i.i) #2 + %120 = bitcast i8* %call2.i.i.i.i.i to float* + br label %_ZNSt12_Vector_baseIfSaIfEE11_M_allocateEm.exit.i.i + +_ZNSt12_Vector_baseIfSaIfEE11_M_allocateEm.exit.i.i: ; preds = %_ZNSt16allocator_traitsISaIfEE8allocateERS0_m.exit.i.i.i, %if.else.i + %121 = phi i8* [ %call2.i.i.i.i.i, %_ZNSt16allocator_traitsISaIfEE8allocateERS0_m.exit.i.i.i ], [ null, %if.else.i ] + %cond.i36.i.i = phi float* [ %120, %_ZNSt16allocator_traitsISaIfEE8allocateERS0_m.exit.i.i.i ], [ null, %if.else.i ] + %add.ptr.i.i = getelementptr inbounds float, float* %cond.i36.i.i, i64 %sub.ptr.div.i22.i.i.i + store float %mul27, float* %add.ptr.i.i, align 4, !tbaa !71 + br i1 %cmp.i.i.i.i, label %_ZSt34__uninitialized_move_if_noexcept_aIPfS0_SaIfEET0_T_S3_S2_RT1_.exit.i.i, label %if.then.i.i.i.i.i.i.i.i.i.i + +if.then.i.i.i.i.i.i.i.i.i.i: ; preds = %_ZNSt12_Vector_baseIfSaIfEE11_M_allocateEm.exit.i.i + %122 = inttoptr i64 %psnr_list.sroa.0.0188 to i8* + call void @llvm.memmove.p0i8.p0i8.i64(i8* %121, i8* %122, i64 %sub.ptr.sub.i21.i.i.i, i32 4, i1 false) #2 + br label %_ZSt34__uninitialized_move_if_noexcept_aIPfS0_SaIfEET0_T_S3_S2_RT1_.exit.i.i + +_ZSt34__uninitialized_move_if_noexcept_aIPfS0_SaIfEET0_T_S3_S2_RT1_.exit.i.i: ; preds = %if.then.i.i.i.i.i.i.i.i.i.i, %_ZNSt12_Vector_baseIfSaIfEE11_M_allocateEm.exit.i.i + %incdec.ptr.i.i = getelementptr inbounds float, float* %add.ptr.i.i, i64 1 + %tobool.i.i.i = icmp eq i64 %psnr_list.sroa.0.0188, 0 + br i1 %tobool.i.i.i, label %_ZNSt6vectorIfSaIfEE19_M_emplace_back_auxIJRKfEEEvDpOT_.exit.i, label %if.then.i37.i.i + +if.then.i37.i.i: ; preds = %_ZSt34__uninitialized_move_if_noexcept_aIPfS0_SaIfEET0_T_S3_S2_RT1_.exit.i.i + %123 = inttoptr i64 %psnr_list.sroa.0.0188 to i8* + call void @_ZdlPv(i8* %123) #2 + br label %_ZNSt6vectorIfSaIfEE19_M_emplace_back_auxIJRKfEEEvDpOT_.exit.i + +_ZNSt6vectorIfSaIfEE19_M_emplace_back_auxIJRKfEEEvDpOT_.exit.i: ; preds = %if.then.i37.i.i, %_ZSt34__uninitialized_move_if_noexcept_aIPfS0_SaIfEET0_T_S3_S2_RT1_.exit.i.i + %124 = ptrtoint i8* %121 to i64 + %125 = ptrtoint float* %incdec.ptr.i.i to i64 + store i64 %125, i64* %psnr_list.sroa.9, align 8 + %add.ptr23.i.i = getelementptr inbounds float, float* %cond.i36.i.i, i64 %cond.i.i.i + br label %_ZNSt6vectorIfSaIfEE9push_backERKf.exit + +_ZNSt6vectorIfSaIfEE9push_backERKf.exit: ; preds = %_ZNSt6vectorIfSaIfEE19_M_emplace_back_auxIJRKfEEEvDpOT_.exit.i, %if.then.i144 + %psnr_list.sroa.0.1 = phi i64 [ %124, %_ZNSt6vectorIfSaIfEE19_M_emplace_back_auxIJRKfEEEvDpOT_.exit.i ], [ %psnr_list.sroa.0.0188, %if.then.i144 ] + %_M_end_of_storage.sink.i = phi float** [ %psnr_list.sroa.13, %_ZNSt6vectorIfSaIfEE19_M_emplace_back_auxIJRKfEEEvDpOT_.exit.i ], [ %psnr_list.sroa.9.0._M_finish.i.sroa_cast, %if.then.i144 ] + %add.ptr23.i.sink.i = phi float* [ %add.ptr23.i.i, %_ZNSt6vectorIfSaIfEE19_M_emplace_back_auxIJRKfEEEvDpOT_.exit.i ], [ %incdec.ptr.i, %if.then.i144 ] + store float* %add.ptr23.i.sink.i, float** %_M_end_of_storage.sink.i, align 8, !tbaa !124 + call void @llvm.lifetime.start(i64 376, i8* nonnull %13) #2 + call void @_ZNSt8ios_baseC2Ev(%"class.std::ios_base"* %15) #2 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [4 x i8*] }, { [4 x i8*] }* @_ZTVSt9basic_iosIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %16, align 16, !tbaa !97 + store %"class.std::basic_ostream"* null, %"class.std::basic_ostream"** %_M_tie.i.i, align 8, !tbaa !99 + store i8 0, i8* %_M_fill.i.i, align 16, !tbaa !102 + store i8 0, i8* %_M_fill_init.i.i, align 1, !tbaa !103 + call void @llvm.memset.p0i8.i64(i8* %17, i8 0, i64 32, i32 8, i1 false) #2 + store i64 %18, i64* %19, align 16, !tbaa !97 + %vbase.offset.i.i = load i64, i64* %22, align 8 + %add.ptr.i.i145 = getelementptr inbounds i8, i8* %13, i64 %vbase.offset.i.i + %126 = bitcast i8* %add.ptr.i.i145 to i64* + store i64 %20, i64* %126, align 8, !tbaa !97 + %vtable3.i.i = load i8*, i8** %21, align 16, !tbaa !97 + %vbase.offset.ptr4.i.i = getelementptr i8, i8* %vtable3.i.i, i64 -24 + %127 = bitcast i8* %vbase.offset.ptr4.i.i to i64* + %vbase.offset5.i.i = load i64, i64* %127, align 8 + %add.ptr6.i.i = getelementptr inbounds i8, i8* %13, i64 %vbase.offset5.i.i + %128 = bitcast i8* %add.ptr6.i.i to %"class.std::basic_ios"* + call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %128, %"class.std::basic_streambuf"* null) #2 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 1, i64 3) to i32 (...)**), i32 (...)*** %16, align 16, !tbaa !97 + store <2 x i32 (...)**> <i32 (...)** bitcast (i8** getelementptr inbounds ({ [5 x i8*], [5 x i8*] }, { [5 x i8*], [5 x i8*] }* @_ZTVNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 3) to i32 (...)**), i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**)>, <2 x i32 (...)**>* %38, align 16, !tbaa !97 + call void @llvm.memset.p0i8.i64(i8* %24, i8 0, i64 48, i32 8, i1 false) #2 + call void @_ZNSt6localeC1Ev(%"class.std::locale"* %_M_buf_locale.i.i.i) #2 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %23, align 8, !tbaa !97 + store i32 16, i32* %_M_mode.i.i, align 8, !tbaa !104 + store %union.anon* %25, %union.anon** %26, align 8, !tbaa !109 + store i64 0, i64* %_M_string_length.i.i.i.i.i, align 8, !tbaa !110 + store i8 0, i8* %.cast.i.i.i, align 8, !tbaa !93 + %vtable.i = load i8*, i8** %21, align 16, !tbaa !97 + %vbase.offset.ptr.i = getelementptr i8, i8* %vtable.i, i64 -24 + %129 = bitcast i8* %vbase.offset.ptr.i to i64* + %vbase.offset.i = load i64, i64* %129, align 8 + %add.ptr2.i = getelementptr inbounds i8, i8* %13, i64 %vbase.offset.i + %130 = bitcast i8* %add.ptr2.i to %"class.std::basic_ios"* + call void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"* %130, %"class.std::basic_streambuf"* %27) #2 + %vtable.i146 = load i8*, i8** %21, align 16, !tbaa !97 + %vbase.offset.ptr.i147 = getelementptr i8, i8* %vtable.i146, i64 -24 + %131 = bitcast i8* %vbase.offset.ptr.i147 to i64* + %vbase.offset.i148 = load i64, i64* %131, align 8 + %add.ptr.i = getelementptr inbounds i8, i8* %13, i64 %vbase.offset.i148 + %_M_flags.i.i = getelementptr inbounds i8, i8* %add.ptr.i, i64 24 + %132 = bitcast i8* %_M_flags.i.i to i32* + %133 = load i32, i32* %132, align 8, !tbaa !116 + %and.i.i.i.i = and i32 %133, -261 + %or.i.i.i.i = or i32 %and.i.i.i.i, 4 + store i32 %or.i.i.i.i, i32* %132, align 4, !tbaa !111 + %call.i151 = call dereferenceable(272) %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"* nonnull %28, double %conv33) #2 + call void @llvm.lifetime.start(i64 32, i8* nonnull %29) #2 + call void @_ZNKSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEE3strEv(%"class.std::__cxx11::basic_string"* nonnull sret %print_str, %"class.std::__cxx11::basic_stringbuf"* %_M_stringbuf.i) #2 + %134 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !113 + %135 = load i64, i64* %_M_string_length.i, align 8, !tbaa !110 + %call39 = call i64 @fwrite(i8* %134, i64 1, i64 %135, %struct._IO_FILE* %call8) + %fputc = call i32 @fputc(i32 10, %struct._IO_FILE* %call8) + %136 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !113 + %cmp.i.i.i = icmp eq i8* %136, %arraydecay.i.i.i.i + br i1 %cmp.i.i.i, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit, label %if.then.i.i + +if.then.i.i: ; preds = %_ZNSt6vectorIfSaIfEE9push_backERKf.exit + call void @_ZdlPv(i8* %136) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit: ; preds = %if.then.i.i, %_ZNSt6vectorIfSaIfEE9push_backERKf.exit + call void @llvm.lifetime.end(i64 32, i8* nonnull %29) #2 + store i64 %31, i64* %19, align 16, !tbaa !97 + %vbase.offset.i.i155 = load i64, i64* %33, align 8 + %add.ptr.i.i156 = getelementptr inbounds i8, i8* %13, i64 %vbase.offset.i.i155 + %137 = bitcast i8* %add.ptr.i.i156 to i64* + store i64 %32, i64* %137, align 8, !tbaa !97 + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %34, align 8, !tbaa !97 + %138 = load i8*, i8** %_M_p.i.i.i.i.i.i.i, align 8, !tbaa !113 + %cmp.i.i.i.i.i.i = icmp eq i8* %138, %.cast.i.i.i + br i1 %cmp.i.i.i.i.i.i, label %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit, label %if.then.i.i.i.i.i157 + +if.then.i.i.i.i.i157: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit + call void @_ZdlPv(i8* %138) #2 + br label %_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit + +_ZNSt7__cxx1119basic_ostringstreamIcSt11char_traitsIcESaIcEED1Ev.exit: ; preds = %if.then.i.i.i.i.i157, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit + store i32 (...)** bitcast (i8** getelementptr inbounds ({ [16 x i8*] }, { [16 x i8*] }* @_ZTVSt15basic_streambufIcSt11char_traitsIcEE, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %34, align 8, !tbaa !97 + call void @_ZNSt6localeD1Ev(%"class.std::locale"* nonnull %_M_buf_locale.i.i.i) #2 + call void @_ZNSt8ios_baseD2Ev(%"class.std::ios_base"* nonnull %35) #2 + call void @llvm.lifetime.end(i64 376, i8* nonnull %13) #2 + %inc42 = add nuw i64 %i.0189, 1 + %cmp = icmp ult i64 %inc42, %5 + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit + +for.body13: ; preds = %for.body13, %for.body13.preheader239 + %mse_sum.0185 = phi float [ %add18, %for.body13 ], [ %mse_sum.0185.ph, %for.body13.preheader239 ] + %j.0184 = phi i64 [ %inc, %for.body13 ], [ %j.0184.ph, %for.body13.preheader239 ] + %add = add i64 %j.0184, %mul9 + %arrayidx14 = getelementptr inbounds float, float* %10, i64 %add + %139 = load float, float* %arrayidx14, align 4, !tbaa !71 + %arrayidx16 = getelementptr inbounds float, float* %12, i64 %add + %140 = load float, float* %arrayidx16, align 4, !tbaa !71 + %sub = fsub fast float %139, %140 + %mul17 = fmul fast float %sub, %sub + %add18 = fadd fast float %mul17, %mse_sum.0185 + %inc = add nuw i64 %j.0184, 1 + %exitcond197 = icmp eq i64 %inc, %mul5 + br i1 %exitcond197, label %for.cond.cleanup12.loopexit, label %for.body13, !llvm.loop !129 + +for.cond.cleanup63.loopexit: ; preds = %for.body64 + br label %for.cond.cleanup63 + +for.cond.cleanup63: ; preds = %for.cond.cleanup63.loopexit, %middle.block211, %for.cond.cleanup + %var.0.lcssa = phi float [ 0.000000e+00, %for.cond.cleanup ], [ %74, %middle.block211 ], [ %add70, %for.cond.cleanup63.loopexit ] + %div75 = fdiv fast float %var.0.lcssa, %conv52 + %call.i158 = call fast float @sqrtf(float %div75) #12 + call void @_Z11dumpPSNRStdf(float %call.i158) + %tobool.i.i.i159 = icmp eq i64 %psnr_list.sroa.0.0.lcssa, 0 + br i1 %tobool.i.i.i159, label %_ZNSt6vectorIfSaIfEED2Ev.exit, label %if.then.i.i.i + +if.then.i.i.i: ; preds = %for.cond.cleanup63 + %141 = inttoptr i64 %psnr_list.sroa.0.0.lcssa to i8* + call void @_ZdlPv(i8* %141) #2 + br label %_ZNSt6vectorIfSaIfEED2Ev.exit + +_ZNSt6vectorIfSaIfEED2Ev.exit: ; preds = %if.then.i.i.i, %for.cond.cleanup63 + call void @llvm.lifetime.end(i64 8, i8* nonnull %psnr_list.sroa.9.0..sroa_cast174) + call void @llvm.lifetime.end(i64 8, i8* nonnull %psnr_list.sroa.13.0..sroa_cast172) + ret float %conv49 + +for.body64: ; preds = %for.body64, %for.body64.preheader + %i60.0181 = phi i64 [ %inc72, %for.body64 ], [ %i60.0181.ph, %for.body64.preheader ] + %var.0180 = phi float [ %add70, %for.body64 ], [ %var.0180.ph, %for.body64.preheader ] + %add.ptr.i160 = getelementptr inbounds float, float* %41, i64 %i60.0181 + %142 = load float, float* %add.ptr.i160, align 4, !tbaa !71 + %sub66 = fsub fast float %142, %div53 + %mul69 = fmul fast float %sub66, %sub66 + %add70 = fadd fast float %mul69, %var.0180 + %inc72 = add nuw i64 %i60.0181, 1 + %exitcond = icmp eq i64 %inc72, %5 + br i1 %exitcond, label %for.cond.cleanup63.loopexit, label %for.body64, !llvm.loop !130 +} + +; Function Attrs: nounwind uwtable +define void @_Z10dumpOutputPvPKc(i8* nocapture readonly %output_ptr, i8* nocapture readonly %file_name) local_unnamed_addr #3 { +entry: + %size_in_bytes1 = getelementptr inbounds i8, i8* %output_ptr, i64 80 + %0 = bitcast i8* %size_in_bytes1 to i64* + %1 = load i64, i64* %0, align 8, !tbaa !69 + %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str.40, i64 0, i64 0), i64 %1) + %host_data2 = getelementptr inbounds i8, i8* %output_ptr, i64 48 + %2 = bitcast i8* %host_data2 to i8** + %3 = load i8*, i8** %2, align 8, !tbaa !68 + %call3 = tail call %struct._IO_FILE* @fopen(i8* %file_name, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.23, i64 0, i64 0)) + %call4 = tail call i64 @fwrite(i8* %3, i64 1, i64 %1, %struct._IO_FILE* %call3) + %call5 = tail call i32 @fclose(%struct._IO_FILE* %call3) ret void } ; Function Attrs: norecurse nounwind uwtable -define i32 @main() local_unnamed_addr #5 { +define i32 @main() local_unnamed_addr #6 { entry: %__dnew.i.i.i.i = alloca i64, align 8 %dir_prefix = alloca %"class.std::__cxx11::basic_string", align 8 @@ -2032,2655 +4747,3123 @@ entry: %dense_2_b_path = alloca %"class.std::__cxx11::basic_string", align 8 %ref.tmp120 = alloca %"class.std::__cxx11::basic_string", align 8 %0 = bitcast %"class.std::__cxx11::basic_string"* %dir_prefix to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %0) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %0) #2 %1 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dir_prefix, i64 0, i32 2 %2 = bitcast %"class.std::__cxx11::basic_string"* %dir_prefix to %union.anon** - store %union.anon* %1, %union.anon** %2, align 8, !tbaa !103 + store %union.anon* %1, %union.anon** %2, align 8, !tbaa !109 %3 = bitcast %union.anon* %1 to i8* %4 = bitcast i64* %__dnew.i.i.i.i to i8* - call void @llvm.lifetime.start(i64 8, i8* nonnull %4) #7 - store i64 71, i64* %__dnew.i.i.i.i, align 8, !tbaa !63 - %call5.i.i.i.i = call i8* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_createERmm(%"class.std::__cxx11::basic_string"* nonnull %dir_prefix, i64* nonnull dereferenceable(8) %__dnew.i.i.i.i, i64 0) #7 + call void @llvm.lifetime.start(i64 8, i8* nonnull %4) #2 + store i64 69, i64* %__dnew.i.i.i.i, align 8, !tbaa !66 + %call5.i.i.i.i = call i8* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_createERmm(%"class.std::__cxx11::basic_string"* nonnull %dir_prefix, i64* nonnull dereferenceable(8) %__dnew.i.i.i.i, i64 0) #2 %_M_p.i13.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dir_prefix, i64 0, i32 0, i32 0 - store i8* %call5.i.i.i.i, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !107 - %5 = load i64, i64* %__dnew.i.i.i.i, align 8, !tbaa !63 + store i8* %call5.i.i.i.i, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !113 + %5 = load i64, i64* %__dnew.i.i.i.i, align 8, !tbaa !66 %_M_allocated_capacity.i.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dir_prefix, i64 0, i32 2, i32 0 - store i64 %5, i64* %_M_allocated_capacity.i.i.i.i.i, align 8, !tbaa !63 - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %call5.i.i.i.i, i8* nonnull getelementptr inbounds ([72 x i8], [72 x i8]* @.str.23, i64 0, i64 0), i64 71, i32 1, i1 false) #7 + store i64 %5, i64* %_M_allocated_capacity.i.i.i.i.i, align 8, !tbaa !66 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %call5.i.i.i.i, i8* nonnull getelementptr inbounds ([70 x i8], [70 x i8]* @.str.41, i64 0, i64 0), i64 69, i32 1, i1 false) #2 %_M_string_length.i.i.i.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dir_prefix, i64 0, i32 1 - store i64 %5, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !104 + store i64 %5, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !110 %arrayidx.i.i.i.i.i = getelementptr inbounds i8, i8* %call5.i.i.i.i, i64 %5 - store i8 0, i8* %arrayidx.i.i.i.i.i, align 1, !tbaa !87 - call void @llvm.lifetime.end(i64 8, i8* nonnull %4) #7 + store i8 0, i8* %arrayidx.i.i.i.i.i, align 1, !tbaa !93 + call void @llvm.lifetime.end(i64 8, i8* nonnull %4) #2 %6 = bitcast %"class.std::__cxx11::basic_string"* %input_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %6) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %6) #2 %7 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp1 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %7) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %7) #2 %8 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp1, i64 0, i32 2 %9 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp1 to %union.anon** - store %union.anon* %8, %union.anon** %9, align 8, !tbaa !103 + store %union.anon* %8, %union.anon** %9, align 8, !tbaa !109 %10 = bitcast %union.anon* %8 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %10, i8* nonnull getelementptr inbounds ([10 x i8], [10 x i8]* @.str.24, i64 0, i64 0), i64 9, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i279 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp1, i64 0, i32 1 - store i64 9, i64* %_M_string_length.i.i.i.i.i.i279, align 8, !tbaa !104 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %10, i8* nonnull getelementptr inbounds ([10 x i8], [10 x i8]* @.str.42, i64 0, i64 0), i64 9, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i287 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp1, i64 0, i32 1 + store i64 9, i64* %_M_string_length.i.i.i.i.i.i287, align 8, !tbaa !110 %11 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp1, i64 0, i32 2, i32 1, i64 1 - store i8 0, i8* %11, align 1, !tbaa !87 - %12 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !104, !noalias !113 - %13 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !107, !noalias !113 - %call3.i.i.i = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp1, i64 0, i64 0, i8* %13, i64 %12) #7, !noalias !113 + store i8 0, i8* %11, align 1, !tbaa !93 + %12 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !110, !noalias !131 + %13 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !113, !noalias !131 + %call3.i.i.i = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp1, i64 0, i64 0, i8* %13, i64 %12) #2, !noalias !131 %14 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %input_path, i64 0, i32 2 %15 = bitcast %"class.std::__cxx11::basic_string"* %input_path to %union.anon** - store %union.anon* %14, %union.anon** %15, align 8, !tbaa !103, !alias.scope !113 + store %union.anon* %14, %union.anon** %15, align 8, !tbaa !109, !alias.scope !131 %_M_p.i.i23.i.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i, i64 0, i32 0, i32 0 - %16 = load i8*, i8** %_M_p.i.i23.i.i, align 8, !tbaa !107 + %16 = load i8*, i8** %_M_p.i.i23.i.i, align 8, !tbaa !113 %17 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i, i64 0, i32 2 - %arraydecay.i.i.i.i282 = bitcast %union.anon* %17 to i8* - %cmp.i.i.i283 = icmp eq i8* %16, %arraydecay.i.i.i.i282 - br i1 %cmp.i.i.i283, label %if.then.i.i284, label %if.else.i.i + %arraydecay.i.i.i.i290 = bitcast %union.anon* %17 to i8* + %cmp.i.i.i291 = icmp eq i8* %16, %arraydecay.i.i.i.i290 + br i1 %cmp.i.i.i291, label %if.then.i.i292, label %if.else.i.i -if.then.i.i284: ; preds = %entry +if.then.i.i292: ; preds = %entry %arraydecay.i.i.i = bitcast %union.anon* %14 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i, i8* %16, i64 16, i32 1, i1 false) #7 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i, i8* %16, i64 16, i32 1, i1 false) #2 br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit if.else.i.i: ; preds = %entry %_M_p.i21.i.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %input_path, i64 0, i32 0, i32 0 - store i8* %16, i8** %_M_p.i21.i.i, align 8, !tbaa !107, !alias.scope !113 + store i8* %16, i8** %_M_p.i21.i.i, align 8, !tbaa !113, !alias.scope !131 %_M_allocated_capacity.i.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i, i64 0, i32 2, i32 0 - %18 = load i64, i64* %_M_allocated_capacity.i.i, align 8, !tbaa !63 + %18 = load i64, i64* %_M_allocated_capacity.i.i, align 8, !tbaa !66 %_M_allocated_capacity.i.i.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %input_path, i64 0, i32 2, i32 0 - store i64 %18, i64* %_M_allocated_capacity.i.i.i, align 8, !tbaa !63, !alias.scope !113 + store i64 %18, i64* %_M_allocated_capacity.i.i.i, align 8, !tbaa !66, !alias.scope !131 br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit: ; preds = %if.else.i.i, %if.then.i.i284 +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit: ; preds = %if.else.i.i, %if.then.i.i292 %_M_string_length.i20.i.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i, i64 0, i32 1 - %19 = load i64, i64* %_M_string_length.i20.i.i, align 8, !tbaa !104 + %19 = load i64, i64* %_M_string_length.i20.i.i, align 8, !tbaa !110 %_M_string_length.i.i2.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %input_path, i64 0, i32 1 - store i64 %19, i64* %_M_string_length.i.i2.i, align 8, !tbaa !104, !alias.scope !113 + store i64 %19, i64* %_M_string_length.i.i2.i, align 8, !tbaa !110, !alias.scope !131 %20 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i to %union.anon** - store %union.anon* %17, %union.anon** %20, align 8, !tbaa !107 - store i64 0, i64* %_M_string_length.i20.i.i, align 8, !tbaa !104 - store i8 0, i8* %arraydecay.i.i.i.i282, align 1, !tbaa !87 - %_M_p.i.i.i.i285 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp1, i64 0, i32 0, i32 0 - %21 = load i8*, i8** %_M_p.i.i.i.i285, align 8, !tbaa !107 - %cmp.i.i.i287 = icmp eq i8* %21, %10 - br i1 %cmp.i.i.i287, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit289, label %if.then.i.i288 - -if.then.i.i288: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit - call void @_ZdlPv(i8* %21) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit289 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit289: ; preds = %if.then.i.i288, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit - call void @llvm.lifetime.end(i64 32, i8* nonnull %7) #7 + store %union.anon* %17, %union.anon** %20, align 8, !tbaa !113 + store i64 0, i64* %_M_string_length.i20.i.i, align 8, !tbaa !110 + store i8 0, i8* %arraydecay.i.i.i.i290, align 1, !tbaa !93 + %_M_p.i.i.i.i293 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp1, i64 0, i32 0, i32 0 + %21 = load i8*, i8** %_M_p.i.i.i.i293, align 8, !tbaa !113 + %cmp.i.i.i295 = icmp eq i8* %21, %10 + br i1 %cmp.i.i.i295, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit297, label %if.then.i.i296 + +if.then.i.i296: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit + call void @_ZdlPv(i8* %21) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit297 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit297: ; preds = %if.then.i.i296, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit + call void @llvm.lifetime.end(i64 32, i8* nonnull %7) #2 %22 = bitcast %"class.std::__cxx11::basic_string"* %labels_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %22) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %22) #2 %23 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp3 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %23) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %23) #2 %24 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp3, i64 0, i32 2 %25 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp3 to %union.anon** - store %union.anon* %24, %union.anon** %25, align 8, !tbaa !103 + store %union.anon* %24, %union.anon** %25, align 8, !tbaa !109 %26 = bitcast %union.anon* %24 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %26, i8* nonnull getelementptr inbounds ([11 x i8], [11 x i8]* @.str.25, i64 0, i64 0), i64 10, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i308 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp3, i64 0, i32 1 - store i64 10, i64* %_M_string_length.i.i.i.i.i.i308, align 8, !tbaa !104 - %27 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp3, i64 0, i32 2, i32 1, i64 2 - store i8 0, i8* %27, align 2, !tbaa !87 - %28 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !104, !noalias !116 - %29 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !107, !noalias !116 - %call3.i.i.i313 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp3, i64 0, i64 0, i8* %29, i64 %28) #7, !noalias !116 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %26, i8* nonnull getelementptr inbounds ([13 x i8], [13 x i8]* @.str.43, i64 0, i64 0), i64 12, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i316 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp3, i64 0, i32 1 + store i64 12, i64* %_M_string_length.i.i.i.i.i.i316, align 8, !tbaa !110 + %27 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp3, i64 0, i32 2, i32 1, i64 4 + store i8 0, i8* %27, align 4, !tbaa !93 + %28 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !110, !noalias !134 + %29 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !113, !noalias !134 + %call3.i.i.i321 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp3, i64 0, i64 0, i8* %29, i64 %28) #2, !noalias !134 %30 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %labels_path, i64 0, i32 2 %31 = bitcast %"class.std::__cxx11::basic_string"* %labels_path to %union.anon** - store %union.anon* %30, %union.anon** %31, align 8, !tbaa !103, !alias.scope !116 - %_M_p.i.i23.i.i314 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i313, i64 0, i32 0, i32 0 - %32 = load i8*, i8** %_M_p.i.i23.i.i314, align 8, !tbaa !107 - %33 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i313, i64 0, i32 2 - %arraydecay.i.i.i.i315 = bitcast %union.anon* %33 to i8* - %cmp.i.i.i316 = icmp eq i8* %32, %arraydecay.i.i.i.i315 - br i1 %cmp.i.i.i316, label %if.then.i.i318, label %if.else.i.i322 - -if.then.i.i318: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit289 - %arraydecay.i.i.i317 = bitcast %union.anon* %30 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i317, i8* %32, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit325 - -if.else.i.i322: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit289 - %_M_p.i21.i.i319 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %labels_path, i64 0, i32 0, i32 0 - store i8* %32, i8** %_M_p.i21.i.i319, align 8, !tbaa !107, !alias.scope !116 - %_M_allocated_capacity.i.i320 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i313, i64 0, i32 2, i32 0 - %34 = load i64, i64* %_M_allocated_capacity.i.i320, align 8, !tbaa !63 - %_M_allocated_capacity.i.i.i321 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %labels_path, i64 0, i32 2, i32 0 - store i64 %34, i64* %_M_allocated_capacity.i.i.i321, align 8, !tbaa !63, !alias.scope !116 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit325 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit325: ; preds = %if.else.i.i322, %if.then.i.i318 - %_M_string_length.i20.i.i323 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i313, i64 0, i32 1 - %35 = load i64, i64* %_M_string_length.i20.i.i323, align 8, !tbaa !104 - %_M_string_length.i.i2.i324 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %labels_path, i64 0, i32 1 - store i64 %35, i64* %_M_string_length.i.i2.i324, align 8, !tbaa !104, !alias.scope !116 - %36 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i313 to %union.anon** - store %union.anon* %33, %union.anon** %36, align 8, !tbaa !107 - store i64 0, i64* %_M_string_length.i20.i.i323, align 8, !tbaa !104 - store i8 0, i8* %arraydecay.i.i.i.i315, align 1, !tbaa !87 - %_M_p.i.i.i.i326 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp3, i64 0, i32 0, i32 0 - %37 = load i8*, i8** %_M_p.i.i.i.i326, align 8, !tbaa !107 - %cmp.i.i.i328 = icmp eq i8* %37, %26 - br i1 %cmp.i.i.i328, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit330, label %if.then.i.i329 - -if.then.i.i329: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit325 - call void @_ZdlPv(i8* %37) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit330 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit330: ; preds = %if.then.i.i329, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit325 - call void @llvm.lifetime.end(i64 32, i8* nonnull %23) #7 + store %union.anon* %30, %union.anon** %31, align 8, !tbaa !109, !alias.scope !134 + %_M_p.i.i23.i.i322 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i321, i64 0, i32 0, i32 0 + %32 = load i8*, i8** %_M_p.i.i23.i.i322, align 8, !tbaa !113 + %33 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i321, i64 0, i32 2 + %arraydecay.i.i.i.i323 = bitcast %union.anon* %33 to i8* + %cmp.i.i.i324 = icmp eq i8* %32, %arraydecay.i.i.i.i323 + br i1 %cmp.i.i.i324, label %if.then.i.i326, label %if.else.i.i330 + +if.then.i.i326: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit297 + %arraydecay.i.i.i325 = bitcast %union.anon* %30 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i325, i8* %32, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit333 + +if.else.i.i330: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit297 + %_M_p.i21.i.i327 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %labels_path, i64 0, i32 0, i32 0 + store i8* %32, i8** %_M_p.i21.i.i327, align 8, !tbaa !113, !alias.scope !134 + %_M_allocated_capacity.i.i328 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i321, i64 0, i32 2, i32 0 + %34 = load i64, i64* %_M_allocated_capacity.i.i328, align 8, !tbaa !66 + %_M_allocated_capacity.i.i.i329 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %labels_path, i64 0, i32 2, i32 0 + store i64 %34, i64* %_M_allocated_capacity.i.i.i329, align 8, !tbaa !66, !alias.scope !134 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit333 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit333: ; preds = %if.else.i.i330, %if.then.i.i326 + %_M_string_length.i20.i.i331 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i321, i64 0, i32 1 + %35 = load i64, i64* %_M_string_length.i20.i.i331, align 8, !tbaa !110 + %_M_string_length.i.i2.i332 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %labels_path, i64 0, i32 1 + store i64 %35, i64* %_M_string_length.i.i2.i332, align 8, !tbaa !110, !alias.scope !134 + %36 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i321 to %union.anon** + store %union.anon* %33, %union.anon** %36, align 8, !tbaa !113 + store i64 0, i64* %_M_string_length.i20.i.i331, align 8, !tbaa !110 + store i8 0, i8* %arraydecay.i.i.i.i323, align 1, !tbaa !93 + %_M_p.i.i.i.i334 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp3, i64 0, i32 0, i32 0 + %37 = load i8*, i8** %_M_p.i.i.i.i334, align 8, !tbaa !113 + %cmp.i.i.i336 = icmp eq i8* %37, %26 + br i1 %cmp.i.i.i336, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit338, label %if.then.i.i337 + +if.then.i.i337: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit333 + call void @_ZdlPv(i8* %37) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit338 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit338: ; preds = %if.then.i.i337, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit333 + call void @llvm.lifetime.end(i64 32, i8* nonnull %23) #2 %38 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_1_w_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %38) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %38) #2 %39 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp5 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %39) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %39) #2 %40 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp5, i64 0, i32 2 %41 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp5 to %union.anon** - store %union.anon* %40, %union.anon** %41, align 8, !tbaa !103 + store %union.anon* %40, %union.anon** %41, align 8, !tbaa !109 %42 = bitcast %union.anon* %40 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %42, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.26, i64 0, i64 0), i64 14, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i349 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp5, i64 0, i32 1 - store i64 14, i64* %_M_string_length.i.i.i.i.i.i349, align 8, !tbaa !104 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %42, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.44, i64 0, i64 0), i64 14, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i357 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp5, i64 0, i32 1 + store i64 14, i64* %_M_string_length.i.i.i.i.i.i357, align 8, !tbaa !110 %43 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp5, i64 0, i32 2, i32 1, i64 6 - store i8 0, i8* %43, align 2, !tbaa !87 - %44 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !104, !noalias !119 - %45 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !107, !noalias !119 - %call3.i.i.i354 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp5, i64 0, i64 0, i8* %45, i64 %44) #7, !noalias !119 + store i8 0, i8* %43, align 2, !tbaa !93 + %44 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !110, !noalias !137 + %45 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !113, !noalias !137 + %call3.i.i.i362 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp5, i64 0, i64 0, i8* %45, i64 %44) #2, !noalias !137 %46 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_1_w_path, i64 0, i32 2 %47 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_1_w_path to %union.anon** - store %union.anon* %46, %union.anon** %47, align 8, !tbaa !103, !alias.scope !119 - %_M_p.i.i23.i.i355 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i354, i64 0, i32 0, i32 0 - %48 = load i8*, i8** %_M_p.i.i23.i.i355, align 8, !tbaa !107 - %49 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i354, i64 0, i32 2 - %arraydecay.i.i.i.i356 = bitcast %union.anon* %49 to i8* - %cmp.i.i.i357 = icmp eq i8* %48, %arraydecay.i.i.i.i356 - br i1 %cmp.i.i.i357, label %if.then.i.i359, label %if.else.i.i363 - -if.then.i.i359: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit330 - %arraydecay.i.i.i358 = bitcast %union.anon* %46 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i358, i8* %48, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit366 - -if.else.i.i363: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit330 - %_M_p.i21.i.i360 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_1_w_path, i64 0, i32 0, i32 0 - store i8* %48, i8** %_M_p.i21.i.i360, align 8, !tbaa !107, !alias.scope !119 - %_M_allocated_capacity.i.i361 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i354, i64 0, i32 2, i32 0 - %50 = load i64, i64* %_M_allocated_capacity.i.i361, align 8, !tbaa !63 - %_M_allocated_capacity.i.i.i362 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_1_w_path, i64 0, i32 2, i32 0 - store i64 %50, i64* %_M_allocated_capacity.i.i.i362, align 8, !tbaa !63, !alias.scope !119 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit366 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit366: ; preds = %if.else.i.i363, %if.then.i.i359 - %_M_string_length.i20.i.i364 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i354, i64 0, i32 1 - %51 = load i64, i64* %_M_string_length.i20.i.i364, align 8, !tbaa !104 - %_M_string_length.i.i2.i365 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_1_w_path, i64 0, i32 1 - store i64 %51, i64* %_M_string_length.i.i2.i365, align 8, !tbaa !104, !alias.scope !119 - %52 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i354 to %union.anon** - store %union.anon* %49, %union.anon** %52, align 8, !tbaa !107 - store i64 0, i64* %_M_string_length.i20.i.i364, align 8, !tbaa !104 - store i8 0, i8* %arraydecay.i.i.i.i356, align 1, !tbaa !87 - %_M_p.i.i.i.i367 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp5, i64 0, i32 0, i32 0 - %53 = load i8*, i8** %_M_p.i.i.i.i367, align 8, !tbaa !107 - %cmp.i.i.i369 = icmp eq i8* %53, %42 - br i1 %cmp.i.i.i369, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit371, label %if.then.i.i370 - -if.then.i.i370: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit366 - call void @_ZdlPv(i8* %53) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit371 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit371: ; preds = %if.then.i.i370, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit366 - call void @llvm.lifetime.end(i64 32, i8* nonnull %39) #7 + store %union.anon* %46, %union.anon** %47, align 8, !tbaa !109, !alias.scope !137 + %_M_p.i.i23.i.i363 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i362, i64 0, i32 0, i32 0 + %48 = load i8*, i8** %_M_p.i.i23.i.i363, align 8, !tbaa !113 + %49 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i362, i64 0, i32 2 + %arraydecay.i.i.i.i364 = bitcast %union.anon* %49 to i8* + %cmp.i.i.i365 = icmp eq i8* %48, %arraydecay.i.i.i.i364 + br i1 %cmp.i.i.i365, label %if.then.i.i367, label %if.else.i.i371 + +if.then.i.i367: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit338 + %arraydecay.i.i.i366 = bitcast %union.anon* %46 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i366, i8* %48, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit374 + +if.else.i.i371: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit338 + %_M_p.i21.i.i368 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_1_w_path, i64 0, i32 0, i32 0 + store i8* %48, i8** %_M_p.i21.i.i368, align 8, !tbaa !113, !alias.scope !137 + %_M_allocated_capacity.i.i369 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i362, i64 0, i32 2, i32 0 + %50 = load i64, i64* %_M_allocated_capacity.i.i369, align 8, !tbaa !66 + %_M_allocated_capacity.i.i.i370 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_1_w_path, i64 0, i32 2, i32 0 + store i64 %50, i64* %_M_allocated_capacity.i.i.i370, align 8, !tbaa !66, !alias.scope !137 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit374 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit374: ; preds = %if.else.i.i371, %if.then.i.i367 + %_M_string_length.i20.i.i372 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i362, i64 0, i32 1 + %51 = load i64, i64* %_M_string_length.i20.i.i372, align 8, !tbaa !110 + %_M_string_length.i.i2.i373 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_1_w_path, i64 0, i32 1 + store i64 %51, i64* %_M_string_length.i.i2.i373, align 8, !tbaa !110, !alias.scope !137 + %52 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i362 to %union.anon** + store %union.anon* %49, %union.anon** %52, align 8, !tbaa !113 + store i64 0, i64* %_M_string_length.i20.i.i372, align 8, !tbaa !110 + store i8 0, i8* %arraydecay.i.i.i.i364, align 1, !tbaa !93 + %_M_p.i.i.i.i375 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp5, i64 0, i32 0, i32 0 + %53 = load i8*, i8** %_M_p.i.i.i.i375, align 8, !tbaa !113 + %cmp.i.i.i377 = icmp eq i8* %53, %42 + br i1 %cmp.i.i.i377, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit379, label %if.then.i.i378 + +if.then.i.i378: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit374 + call void @_ZdlPv(i8* %53) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit379 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit379: ; preds = %if.then.i.i378, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit374 + call void @llvm.lifetime.end(i64 32, i8* nonnull %39) #2 %_M_p.i.i = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_1_w_path, i64 0, i32 0, i32 0 - %54 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !107 - %call7 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %54, i32 0, i32 64, i32 3, i32 3, i32 3) + %54 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !113 + %call7 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %54, i32 0, i64 64, i64 3, i64 3, i64 3) %55 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_1_b_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %55) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %55) #2 %56 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp8 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %56) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %56) #2 %57 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp8, i64 0, i32 2 %58 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp8 to %union.anon** - store %union.anon* %57, %union.anon** %58, align 8, !tbaa !103 + store %union.anon* %57, %union.anon** %58, align 8, !tbaa !109 %59 = bitcast %union.anon* %57 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %59, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.27, i64 0, i64 0), i64 14, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i395 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp8, i64 0, i32 1 - store i64 14, i64* %_M_string_length.i.i.i.i.i.i395, align 8, !tbaa !104 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %59, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.45, i64 0, i64 0), i64 14, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i403 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp8, i64 0, i32 1 + store i64 14, i64* %_M_string_length.i.i.i.i.i.i403, align 8, !tbaa !110 %60 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp8, i64 0, i32 2, i32 1, i64 6 - store i8 0, i8* %60, align 2, !tbaa !87 - %61 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !104, !noalias !122 - %62 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !107, !noalias !122 - %call3.i.i.i400 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp8, i64 0, i64 0, i8* %62, i64 %61) #7, !noalias !122 + store i8 0, i8* %60, align 2, !tbaa !93 + %61 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !110, !noalias !140 + %62 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !113, !noalias !140 + %call3.i.i.i408 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp8, i64 0, i64 0, i8* %62, i64 %61) #2, !noalias !140 %63 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_1_b_path, i64 0, i32 2 %64 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_1_b_path to %union.anon** - store %union.anon* %63, %union.anon** %64, align 8, !tbaa !103, !alias.scope !122 - %_M_p.i.i23.i.i401 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i400, i64 0, i32 0, i32 0 - %65 = load i8*, i8** %_M_p.i.i23.i.i401, align 8, !tbaa !107 - %66 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i400, i64 0, i32 2 - %arraydecay.i.i.i.i402 = bitcast %union.anon* %66 to i8* - %cmp.i.i.i403 = icmp eq i8* %65, %arraydecay.i.i.i.i402 - br i1 %cmp.i.i.i403, label %if.then.i.i405, label %if.else.i.i409 - -if.then.i.i405: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit371 - %arraydecay.i.i.i404 = bitcast %union.anon* %63 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i404, i8* %65, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit412 - -if.else.i.i409: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit371 - %_M_p.i21.i.i406 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_1_b_path, i64 0, i32 0, i32 0 - store i8* %65, i8** %_M_p.i21.i.i406, align 8, !tbaa !107, !alias.scope !122 - %_M_allocated_capacity.i.i407 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i400, i64 0, i32 2, i32 0 - %67 = load i64, i64* %_M_allocated_capacity.i.i407, align 8, !tbaa !63 - %_M_allocated_capacity.i.i.i408 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_1_b_path, i64 0, i32 2, i32 0 - store i64 %67, i64* %_M_allocated_capacity.i.i.i408, align 8, !tbaa !63, !alias.scope !122 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit412 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit412: ; preds = %if.else.i.i409, %if.then.i.i405 - %_M_string_length.i20.i.i410 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i400, i64 0, i32 1 - %68 = load i64, i64* %_M_string_length.i20.i.i410, align 8, !tbaa !104 - %_M_string_length.i.i2.i411 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_1_b_path, i64 0, i32 1 - store i64 %68, i64* %_M_string_length.i.i2.i411, align 8, !tbaa !104, !alias.scope !122 - %69 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i400 to %union.anon** - store %union.anon* %66, %union.anon** %69, align 8, !tbaa !107 - store i64 0, i64* %_M_string_length.i20.i.i410, align 8, !tbaa !104 - store i8 0, i8* %arraydecay.i.i.i.i402, align 1, !tbaa !87 - %_M_p.i.i.i.i413 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp8, i64 0, i32 0, i32 0 - %70 = load i8*, i8** %_M_p.i.i.i.i413, align 8, !tbaa !107 - %cmp.i.i.i415 = icmp eq i8* %70, %59 - br i1 %cmp.i.i.i415, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit417, label %if.then.i.i416 - -if.then.i.i416: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit412 - call void @_ZdlPv(i8* %70) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit417 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit417: ; preds = %if.then.i.i416, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit412 - call void @llvm.lifetime.end(i64 32, i8* nonnull %56) #7 - %_M_p.i.i418 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_1_b_path, i64 0, i32 0, i32 0 - %71 = load i8*, i8** %_M_p.i.i418, align 8, !tbaa !107 - %call11 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %71, i32 0, i32 1, i32 64, i32 1, i32 1) + store %union.anon* %63, %union.anon** %64, align 8, !tbaa !109, !alias.scope !140 + %_M_p.i.i23.i.i409 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i408, i64 0, i32 0, i32 0 + %65 = load i8*, i8** %_M_p.i.i23.i.i409, align 8, !tbaa !113 + %66 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i408, i64 0, i32 2 + %arraydecay.i.i.i.i410 = bitcast %union.anon* %66 to i8* + %cmp.i.i.i411 = icmp eq i8* %65, %arraydecay.i.i.i.i410 + br i1 %cmp.i.i.i411, label %if.then.i.i413, label %if.else.i.i417 + +if.then.i.i413: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit379 + %arraydecay.i.i.i412 = bitcast %union.anon* %63 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i412, i8* %65, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit420 + +if.else.i.i417: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit379 + %_M_p.i21.i.i414 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_1_b_path, i64 0, i32 0, i32 0 + store i8* %65, i8** %_M_p.i21.i.i414, align 8, !tbaa !113, !alias.scope !140 + %_M_allocated_capacity.i.i415 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i408, i64 0, i32 2, i32 0 + %67 = load i64, i64* %_M_allocated_capacity.i.i415, align 8, !tbaa !66 + %_M_allocated_capacity.i.i.i416 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_1_b_path, i64 0, i32 2, i32 0 + store i64 %67, i64* %_M_allocated_capacity.i.i.i416, align 8, !tbaa !66, !alias.scope !140 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit420 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit420: ; preds = %if.else.i.i417, %if.then.i.i413 + %_M_string_length.i20.i.i418 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i408, i64 0, i32 1 + %68 = load i64, i64* %_M_string_length.i20.i.i418, align 8, !tbaa !110 + %_M_string_length.i.i2.i419 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_1_b_path, i64 0, i32 1 + store i64 %68, i64* %_M_string_length.i.i2.i419, align 8, !tbaa !110, !alias.scope !140 + %69 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i408 to %union.anon** + store %union.anon* %66, %union.anon** %69, align 8, !tbaa !113 + store i64 0, i64* %_M_string_length.i20.i.i418, align 8, !tbaa !110 + store i8 0, i8* %arraydecay.i.i.i.i410, align 1, !tbaa !93 + %_M_p.i.i.i.i421 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp8, i64 0, i32 0, i32 0 + %70 = load i8*, i8** %_M_p.i.i.i.i421, align 8, !tbaa !113 + %cmp.i.i.i423 = icmp eq i8* %70, %59 + br i1 %cmp.i.i.i423, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit425, label %if.then.i.i424 + +if.then.i.i424: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit420 + call void @_ZdlPv(i8* %70) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit425 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit425: ; preds = %if.then.i.i424, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit420 + call void @llvm.lifetime.end(i64 32, i8* nonnull %56) #2 + %_M_p.i.i426 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_1_b_path, i64 0, i32 0, i32 0 + %71 = load i8*, i8** %_M_p.i.i426, align 8, !tbaa !113 + %call11 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %71, i32 0, i64 1, i64 64, i64 1, i64 1) %72 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_2_w_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %72) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %72) #2 %73 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp12 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %73) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %73) #2 %74 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp12, i64 0, i32 2 %75 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp12 to %union.anon** - store %union.anon* %74, %union.anon** %75, align 8, !tbaa !103 + store %union.anon* %74, %union.anon** %75, align 8, !tbaa !109 %76 = bitcast %union.anon* %74 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %76, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.28, i64 0, i64 0), i64 14, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i442 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp12, i64 0, i32 1 - store i64 14, i64* %_M_string_length.i.i.i.i.i.i442, align 8, !tbaa !104 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %76, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.46, i64 0, i64 0), i64 14, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i450 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp12, i64 0, i32 1 + store i64 14, i64* %_M_string_length.i.i.i.i.i.i450, align 8, !tbaa !110 %77 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp12, i64 0, i32 2, i32 1, i64 6 - store i8 0, i8* %77, align 2, !tbaa !87 - %78 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !104, !noalias !125 - %79 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !107, !noalias !125 - %call3.i.i.i447 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp12, i64 0, i64 0, i8* %79, i64 %78) #7, !noalias !125 + store i8 0, i8* %77, align 2, !tbaa !93 + %78 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !110, !noalias !143 + %79 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !113, !noalias !143 + %call3.i.i.i455 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp12, i64 0, i64 0, i8* %79, i64 %78) #2, !noalias !143 %80 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_2_w_path, i64 0, i32 2 %81 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_2_w_path to %union.anon** - store %union.anon* %80, %union.anon** %81, align 8, !tbaa !103, !alias.scope !125 - %_M_p.i.i23.i.i448 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i447, i64 0, i32 0, i32 0 - %82 = load i8*, i8** %_M_p.i.i23.i.i448, align 8, !tbaa !107 - %83 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i447, i64 0, i32 2 - %arraydecay.i.i.i.i449 = bitcast %union.anon* %83 to i8* - %cmp.i.i.i450 = icmp eq i8* %82, %arraydecay.i.i.i.i449 - br i1 %cmp.i.i.i450, label %if.then.i.i452, label %if.else.i.i456 - -if.then.i.i452: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit417 - %arraydecay.i.i.i451 = bitcast %union.anon* %80 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i451, i8* %82, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit459 - -if.else.i.i456: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit417 - %_M_p.i21.i.i453 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_2_w_path, i64 0, i32 0, i32 0 - store i8* %82, i8** %_M_p.i21.i.i453, align 8, !tbaa !107, !alias.scope !125 - %_M_allocated_capacity.i.i454 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i447, i64 0, i32 2, i32 0 - %84 = load i64, i64* %_M_allocated_capacity.i.i454, align 8, !tbaa !63 - %_M_allocated_capacity.i.i.i455 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_2_w_path, i64 0, i32 2, i32 0 - store i64 %84, i64* %_M_allocated_capacity.i.i.i455, align 8, !tbaa !63, !alias.scope !125 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit459 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit459: ; preds = %if.else.i.i456, %if.then.i.i452 - %_M_string_length.i20.i.i457 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i447, i64 0, i32 1 - %85 = load i64, i64* %_M_string_length.i20.i.i457, align 8, !tbaa !104 - %_M_string_length.i.i2.i458 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_2_w_path, i64 0, i32 1 - store i64 %85, i64* %_M_string_length.i.i2.i458, align 8, !tbaa !104, !alias.scope !125 - %86 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i447 to %union.anon** - store %union.anon* %83, %union.anon** %86, align 8, !tbaa !107 - store i64 0, i64* %_M_string_length.i20.i.i457, align 8, !tbaa !104 - store i8 0, i8* %arraydecay.i.i.i.i449, align 1, !tbaa !87 - %_M_p.i.i.i.i460 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp12, i64 0, i32 0, i32 0 - %87 = load i8*, i8** %_M_p.i.i.i.i460, align 8, !tbaa !107 - %cmp.i.i.i462 = icmp eq i8* %87, %76 - br i1 %cmp.i.i.i462, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit464, label %if.then.i.i463 - -if.then.i.i463: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit459 - call void @_ZdlPv(i8* %87) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit464 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit464: ; preds = %if.then.i.i463, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit459 - call void @llvm.lifetime.end(i64 32, i8* nonnull %73) #7 - %_M_p.i.i465 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_2_w_path, i64 0, i32 0, i32 0 - %88 = load i8*, i8** %_M_p.i.i465, align 8, !tbaa !107 - %call15 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %88, i32 0, i32 64, i32 64, i32 3, i32 3) + store %union.anon* %80, %union.anon** %81, align 8, !tbaa !109, !alias.scope !143 + %_M_p.i.i23.i.i456 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i455, i64 0, i32 0, i32 0 + %82 = load i8*, i8** %_M_p.i.i23.i.i456, align 8, !tbaa !113 + %83 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i455, i64 0, i32 2 + %arraydecay.i.i.i.i457 = bitcast %union.anon* %83 to i8* + %cmp.i.i.i458 = icmp eq i8* %82, %arraydecay.i.i.i.i457 + br i1 %cmp.i.i.i458, label %if.then.i.i460, label %if.else.i.i464 + +if.then.i.i460: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit425 + %arraydecay.i.i.i459 = bitcast %union.anon* %80 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i459, i8* %82, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit467 + +if.else.i.i464: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit425 + %_M_p.i21.i.i461 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_2_w_path, i64 0, i32 0, i32 0 + store i8* %82, i8** %_M_p.i21.i.i461, align 8, !tbaa !113, !alias.scope !143 + %_M_allocated_capacity.i.i462 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i455, i64 0, i32 2, i32 0 + %84 = load i64, i64* %_M_allocated_capacity.i.i462, align 8, !tbaa !66 + %_M_allocated_capacity.i.i.i463 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_2_w_path, i64 0, i32 2, i32 0 + store i64 %84, i64* %_M_allocated_capacity.i.i.i463, align 8, !tbaa !66, !alias.scope !143 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit467 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit467: ; preds = %if.else.i.i464, %if.then.i.i460 + %_M_string_length.i20.i.i465 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i455, i64 0, i32 1 + %85 = load i64, i64* %_M_string_length.i20.i.i465, align 8, !tbaa !110 + %_M_string_length.i.i2.i466 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_2_w_path, i64 0, i32 1 + store i64 %85, i64* %_M_string_length.i.i2.i466, align 8, !tbaa !110, !alias.scope !143 + %86 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i455 to %union.anon** + store %union.anon* %83, %union.anon** %86, align 8, !tbaa !113 + store i64 0, i64* %_M_string_length.i20.i.i465, align 8, !tbaa !110 + store i8 0, i8* %arraydecay.i.i.i.i457, align 1, !tbaa !93 + %_M_p.i.i.i.i468 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp12, i64 0, i32 0, i32 0 + %87 = load i8*, i8** %_M_p.i.i.i.i468, align 8, !tbaa !113 + %cmp.i.i.i470 = icmp eq i8* %87, %76 + br i1 %cmp.i.i.i470, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit472, label %if.then.i.i471 + +if.then.i.i471: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit467 + call void @_ZdlPv(i8* %87) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit472 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit472: ; preds = %if.then.i.i471, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit467 + call void @llvm.lifetime.end(i64 32, i8* nonnull %73) #2 + %_M_p.i.i473 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_2_w_path, i64 0, i32 0, i32 0 + %88 = load i8*, i8** %_M_p.i.i473, align 8, !tbaa !113 + %call15 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %88, i32 0, i64 64, i64 64, i64 3, i64 3) %89 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_2_b_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %89) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %89) #2 %90 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp16 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %90) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %90) #2 %91 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp16, i64 0, i32 2 %92 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp16 to %union.anon** - store %union.anon* %91, %union.anon** %92, align 8, !tbaa !103 + store %union.anon* %91, %union.anon** %92, align 8, !tbaa !109 %93 = bitcast %union.anon* %91 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %93, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.29, i64 0, i64 0), i64 14, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i489 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp16, i64 0, i32 1 - store i64 14, i64* %_M_string_length.i.i.i.i.i.i489, align 8, !tbaa !104 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %93, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.47, i64 0, i64 0), i64 14, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i497 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp16, i64 0, i32 1 + store i64 14, i64* %_M_string_length.i.i.i.i.i.i497, align 8, !tbaa !110 %94 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp16, i64 0, i32 2, i32 1, i64 6 - store i8 0, i8* %94, align 2, !tbaa !87 - %95 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !104, !noalias !128 - %96 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !107, !noalias !128 - %call3.i.i.i494 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp16, i64 0, i64 0, i8* %96, i64 %95) #7, !noalias !128 + store i8 0, i8* %94, align 2, !tbaa !93 + %95 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !110, !noalias !146 + %96 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !113, !noalias !146 + %call3.i.i.i502 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp16, i64 0, i64 0, i8* %96, i64 %95) #2, !noalias !146 %97 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_2_b_path, i64 0, i32 2 %98 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_2_b_path to %union.anon** - store %union.anon* %97, %union.anon** %98, align 8, !tbaa !103, !alias.scope !128 - %_M_p.i.i23.i.i495 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i494, i64 0, i32 0, i32 0 - %99 = load i8*, i8** %_M_p.i.i23.i.i495, align 8, !tbaa !107 - %100 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i494, i64 0, i32 2 - %arraydecay.i.i.i.i496 = bitcast %union.anon* %100 to i8* - %cmp.i.i.i497 = icmp eq i8* %99, %arraydecay.i.i.i.i496 - br i1 %cmp.i.i.i497, label %if.then.i.i499, label %if.else.i.i503 - -if.then.i.i499: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit464 - %arraydecay.i.i.i498 = bitcast %union.anon* %97 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i498, i8* %99, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit506 - -if.else.i.i503: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit464 - %_M_p.i21.i.i500 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_2_b_path, i64 0, i32 0, i32 0 - store i8* %99, i8** %_M_p.i21.i.i500, align 8, !tbaa !107, !alias.scope !128 - %_M_allocated_capacity.i.i501 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i494, i64 0, i32 2, i32 0 - %101 = load i64, i64* %_M_allocated_capacity.i.i501, align 8, !tbaa !63 - %_M_allocated_capacity.i.i.i502 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_2_b_path, i64 0, i32 2, i32 0 - store i64 %101, i64* %_M_allocated_capacity.i.i.i502, align 8, !tbaa !63, !alias.scope !128 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit506 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit506: ; preds = %if.else.i.i503, %if.then.i.i499 - %_M_string_length.i20.i.i504 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i494, i64 0, i32 1 - %102 = load i64, i64* %_M_string_length.i20.i.i504, align 8, !tbaa !104 - %_M_string_length.i.i2.i505 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_2_b_path, i64 0, i32 1 - store i64 %102, i64* %_M_string_length.i.i2.i505, align 8, !tbaa !104, !alias.scope !128 - %103 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i494 to %union.anon** - store %union.anon* %100, %union.anon** %103, align 8, !tbaa !107 - store i64 0, i64* %_M_string_length.i20.i.i504, align 8, !tbaa !104 - store i8 0, i8* %arraydecay.i.i.i.i496, align 1, !tbaa !87 - %_M_p.i.i.i.i507 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp16, i64 0, i32 0, i32 0 - %104 = load i8*, i8** %_M_p.i.i.i.i507, align 8, !tbaa !107 - %cmp.i.i.i509 = icmp eq i8* %104, %93 - br i1 %cmp.i.i.i509, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit511, label %if.then.i.i510 - -if.then.i.i510: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit506 - call void @_ZdlPv(i8* %104) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit511 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit511: ; preds = %if.then.i.i510, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit506 - call void @llvm.lifetime.end(i64 32, i8* nonnull %90) #7 - %_M_p.i.i512 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_2_b_path, i64 0, i32 0, i32 0 - %105 = load i8*, i8** %_M_p.i.i512, align 8, !tbaa !107 - %call19 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %105, i32 0, i32 1, i32 64, i32 1, i32 1) + store %union.anon* %97, %union.anon** %98, align 8, !tbaa !109, !alias.scope !146 + %_M_p.i.i23.i.i503 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i502, i64 0, i32 0, i32 0 + %99 = load i8*, i8** %_M_p.i.i23.i.i503, align 8, !tbaa !113 + %100 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i502, i64 0, i32 2 + %arraydecay.i.i.i.i504 = bitcast %union.anon* %100 to i8* + %cmp.i.i.i505 = icmp eq i8* %99, %arraydecay.i.i.i.i504 + br i1 %cmp.i.i.i505, label %if.then.i.i507, label %if.else.i.i511 + +if.then.i.i507: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit472 + %arraydecay.i.i.i506 = bitcast %union.anon* %97 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i506, i8* %99, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit514 + +if.else.i.i511: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit472 + %_M_p.i21.i.i508 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_2_b_path, i64 0, i32 0, i32 0 + store i8* %99, i8** %_M_p.i21.i.i508, align 8, !tbaa !113, !alias.scope !146 + %_M_allocated_capacity.i.i509 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i502, i64 0, i32 2, i32 0 + %101 = load i64, i64* %_M_allocated_capacity.i.i509, align 8, !tbaa !66 + %_M_allocated_capacity.i.i.i510 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_2_b_path, i64 0, i32 2, i32 0 + store i64 %101, i64* %_M_allocated_capacity.i.i.i510, align 8, !tbaa !66, !alias.scope !146 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit514 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit514: ; preds = %if.else.i.i511, %if.then.i.i507 + %_M_string_length.i20.i.i512 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i502, i64 0, i32 1 + %102 = load i64, i64* %_M_string_length.i20.i.i512, align 8, !tbaa !110 + %_M_string_length.i.i2.i513 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_2_b_path, i64 0, i32 1 + store i64 %102, i64* %_M_string_length.i.i2.i513, align 8, !tbaa !110, !alias.scope !146 + %103 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i502 to %union.anon** + store %union.anon* %100, %union.anon** %103, align 8, !tbaa !113 + store i64 0, i64* %_M_string_length.i20.i.i512, align 8, !tbaa !110 + store i8 0, i8* %arraydecay.i.i.i.i504, align 1, !tbaa !93 + %_M_p.i.i.i.i515 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp16, i64 0, i32 0, i32 0 + %104 = load i8*, i8** %_M_p.i.i.i.i515, align 8, !tbaa !113 + %cmp.i.i.i517 = icmp eq i8* %104, %93 + br i1 %cmp.i.i.i517, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit519, label %if.then.i.i518 + +if.then.i.i518: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit514 + call void @_ZdlPv(i8* %104) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit519 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit519: ; preds = %if.then.i.i518, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit514 + call void @llvm.lifetime.end(i64 32, i8* nonnull %90) #2 + %_M_p.i.i520 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_2_b_path, i64 0, i32 0, i32 0 + %105 = load i8*, i8** %_M_p.i.i520, align 8, !tbaa !113 + %call19 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %105, i32 0, i64 1, i64 64, i64 1, i64 1) %106 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_3_w_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %106) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %106) #2 %107 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp20 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %107) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %107) #2 %108 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp20, i64 0, i32 2 %109 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp20 to %union.anon** - store %union.anon* %108, %union.anon** %109, align 8, !tbaa !103 + store %union.anon* %108, %union.anon** %109, align 8, !tbaa !109 %110 = bitcast %union.anon* %108 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %110, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.30, i64 0, i64 0), i64 14, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i536 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp20, i64 0, i32 1 - store i64 14, i64* %_M_string_length.i.i.i.i.i.i536, align 8, !tbaa !104 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %110, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.48, i64 0, i64 0), i64 14, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i544 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp20, i64 0, i32 1 + store i64 14, i64* %_M_string_length.i.i.i.i.i.i544, align 8, !tbaa !110 %111 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp20, i64 0, i32 2, i32 1, i64 6 - store i8 0, i8* %111, align 2, !tbaa !87 - %112 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !104, !noalias !131 - %113 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !107, !noalias !131 - %call3.i.i.i541 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp20, i64 0, i64 0, i8* %113, i64 %112) #7, !noalias !131 + store i8 0, i8* %111, align 2, !tbaa !93 + %112 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !110, !noalias !149 + %113 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !113, !noalias !149 + %call3.i.i.i549 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp20, i64 0, i64 0, i8* %113, i64 %112) #2, !noalias !149 %114 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_3_w_path, i64 0, i32 2 %115 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_3_w_path to %union.anon** - store %union.anon* %114, %union.anon** %115, align 8, !tbaa !103, !alias.scope !131 - %_M_p.i.i23.i.i542 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i541, i64 0, i32 0, i32 0 - %116 = load i8*, i8** %_M_p.i.i23.i.i542, align 8, !tbaa !107 - %117 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i541, i64 0, i32 2 - %arraydecay.i.i.i.i543 = bitcast %union.anon* %117 to i8* - %cmp.i.i.i544 = icmp eq i8* %116, %arraydecay.i.i.i.i543 - br i1 %cmp.i.i.i544, label %if.then.i.i546, label %if.else.i.i550 - -if.then.i.i546: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit511 - %arraydecay.i.i.i545 = bitcast %union.anon* %114 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i545, i8* %116, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit553 - -if.else.i.i550: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit511 - %_M_p.i21.i.i547 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_3_w_path, i64 0, i32 0, i32 0 - store i8* %116, i8** %_M_p.i21.i.i547, align 8, !tbaa !107, !alias.scope !131 - %_M_allocated_capacity.i.i548 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i541, i64 0, i32 2, i32 0 - %118 = load i64, i64* %_M_allocated_capacity.i.i548, align 8, !tbaa !63 - %_M_allocated_capacity.i.i.i549 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_3_w_path, i64 0, i32 2, i32 0 - store i64 %118, i64* %_M_allocated_capacity.i.i.i549, align 8, !tbaa !63, !alias.scope !131 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit553 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit553: ; preds = %if.else.i.i550, %if.then.i.i546 - %_M_string_length.i20.i.i551 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i541, i64 0, i32 1 - %119 = load i64, i64* %_M_string_length.i20.i.i551, align 8, !tbaa !104 - %_M_string_length.i.i2.i552 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_3_w_path, i64 0, i32 1 - store i64 %119, i64* %_M_string_length.i.i2.i552, align 8, !tbaa !104, !alias.scope !131 - %120 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i541 to %union.anon** - store %union.anon* %117, %union.anon** %120, align 8, !tbaa !107 - store i64 0, i64* %_M_string_length.i20.i.i551, align 8, !tbaa !104 - store i8 0, i8* %arraydecay.i.i.i.i543, align 1, !tbaa !87 - %_M_p.i.i.i.i554 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp20, i64 0, i32 0, i32 0 - %121 = load i8*, i8** %_M_p.i.i.i.i554, align 8, !tbaa !107 - %cmp.i.i.i556 = icmp eq i8* %121, %110 - br i1 %cmp.i.i.i556, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit558, label %if.then.i.i557 - -if.then.i.i557: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit553 - call void @_ZdlPv(i8* %121) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit558 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit558: ; preds = %if.then.i.i557, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit553 - call void @llvm.lifetime.end(i64 32, i8* nonnull %107) #7 - %_M_p.i.i559 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_3_w_path, i64 0, i32 0, i32 0 - %122 = load i8*, i8** %_M_p.i.i559, align 8, !tbaa !107 - %call23 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %122, i32 0, i32 128, i32 64, i32 3, i32 3) + store %union.anon* %114, %union.anon** %115, align 8, !tbaa !109, !alias.scope !149 + %_M_p.i.i23.i.i550 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i549, i64 0, i32 0, i32 0 + %116 = load i8*, i8** %_M_p.i.i23.i.i550, align 8, !tbaa !113 + %117 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i549, i64 0, i32 2 + %arraydecay.i.i.i.i551 = bitcast %union.anon* %117 to i8* + %cmp.i.i.i552 = icmp eq i8* %116, %arraydecay.i.i.i.i551 + br i1 %cmp.i.i.i552, label %if.then.i.i554, label %if.else.i.i558 + +if.then.i.i554: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit519 + %arraydecay.i.i.i553 = bitcast %union.anon* %114 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i553, i8* %116, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit561 + +if.else.i.i558: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit519 + %_M_p.i21.i.i555 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_3_w_path, i64 0, i32 0, i32 0 + store i8* %116, i8** %_M_p.i21.i.i555, align 8, !tbaa !113, !alias.scope !149 + %_M_allocated_capacity.i.i556 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i549, i64 0, i32 2, i32 0 + %118 = load i64, i64* %_M_allocated_capacity.i.i556, align 8, !tbaa !66 + %_M_allocated_capacity.i.i.i557 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_3_w_path, i64 0, i32 2, i32 0 + store i64 %118, i64* %_M_allocated_capacity.i.i.i557, align 8, !tbaa !66, !alias.scope !149 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit561 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit561: ; preds = %if.else.i.i558, %if.then.i.i554 + %_M_string_length.i20.i.i559 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i549, i64 0, i32 1 + %119 = load i64, i64* %_M_string_length.i20.i.i559, align 8, !tbaa !110 + %_M_string_length.i.i2.i560 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_3_w_path, i64 0, i32 1 + store i64 %119, i64* %_M_string_length.i.i2.i560, align 8, !tbaa !110, !alias.scope !149 + %120 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i549 to %union.anon** + store %union.anon* %117, %union.anon** %120, align 8, !tbaa !113 + store i64 0, i64* %_M_string_length.i20.i.i559, align 8, !tbaa !110 + store i8 0, i8* %arraydecay.i.i.i.i551, align 1, !tbaa !93 + %_M_p.i.i.i.i562 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp20, i64 0, i32 0, i32 0 + %121 = load i8*, i8** %_M_p.i.i.i.i562, align 8, !tbaa !113 + %cmp.i.i.i564 = icmp eq i8* %121, %110 + br i1 %cmp.i.i.i564, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit566, label %if.then.i.i565 + +if.then.i.i565: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit561 + call void @_ZdlPv(i8* %121) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit566 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit566: ; preds = %if.then.i.i565, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit561 + call void @llvm.lifetime.end(i64 32, i8* nonnull %107) #2 + %_M_p.i.i567 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_3_w_path, i64 0, i32 0, i32 0 + %122 = load i8*, i8** %_M_p.i.i567, align 8, !tbaa !113 + %call23 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %122, i32 0, i64 128, i64 64, i64 3, i64 3) %123 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_3_b_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %123) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %123) #2 %124 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp24 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %124) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %124) #2 %125 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp24, i64 0, i32 2 %126 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp24 to %union.anon** - store %union.anon* %125, %union.anon** %126, align 8, !tbaa !103 + store %union.anon* %125, %union.anon** %126, align 8, !tbaa !109 %127 = bitcast %union.anon* %125 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %127, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.31, i64 0, i64 0), i64 14, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i583 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp24, i64 0, i32 1 - store i64 14, i64* %_M_string_length.i.i.i.i.i.i583, align 8, !tbaa !104 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %127, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.49, i64 0, i64 0), i64 14, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i591 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp24, i64 0, i32 1 + store i64 14, i64* %_M_string_length.i.i.i.i.i.i591, align 8, !tbaa !110 %128 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp24, i64 0, i32 2, i32 1, i64 6 - store i8 0, i8* %128, align 2, !tbaa !87 - %129 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !104, !noalias !134 - %130 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !107, !noalias !134 - %call3.i.i.i588 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp24, i64 0, i64 0, i8* %130, i64 %129) #7, !noalias !134 + store i8 0, i8* %128, align 2, !tbaa !93 + %129 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !110, !noalias !152 + %130 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !113, !noalias !152 + %call3.i.i.i596 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp24, i64 0, i64 0, i8* %130, i64 %129) #2, !noalias !152 %131 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_3_b_path, i64 0, i32 2 %132 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_3_b_path to %union.anon** - store %union.anon* %131, %union.anon** %132, align 8, !tbaa !103, !alias.scope !134 - %_M_p.i.i23.i.i589 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i588, i64 0, i32 0, i32 0 - %133 = load i8*, i8** %_M_p.i.i23.i.i589, align 8, !tbaa !107 - %134 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i588, i64 0, i32 2 - %arraydecay.i.i.i.i590 = bitcast %union.anon* %134 to i8* - %cmp.i.i.i591 = icmp eq i8* %133, %arraydecay.i.i.i.i590 - br i1 %cmp.i.i.i591, label %if.then.i.i593, label %if.else.i.i597 - -if.then.i.i593: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit558 - %arraydecay.i.i.i592 = bitcast %union.anon* %131 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i592, i8* %133, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit600 - -if.else.i.i597: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit558 - %_M_p.i21.i.i594 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_3_b_path, i64 0, i32 0, i32 0 - store i8* %133, i8** %_M_p.i21.i.i594, align 8, !tbaa !107, !alias.scope !134 - %_M_allocated_capacity.i.i595 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i588, i64 0, i32 2, i32 0 - %135 = load i64, i64* %_M_allocated_capacity.i.i595, align 8, !tbaa !63 - %_M_allocated_capacity.i.i.i596 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_3_b_path, i64 0, i32 2, i32 0 - store i64 %135, i64* %_M_allocated_capacity.i.i.i596, align 8, !tbaa !63, !alias.scope !134 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit600 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit600: ; preds = %if.else.i.i597, %if.then.i.i593 - %_M_string_length.i20.i.i598 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i588, i64 0, i32 1 - %136 = load i64, i64* %_M_string_length.i20.i.i598, align 8, !tbaa !104 - %_M_string_length.i.i2.i599 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_3_b_path, i64 0, i32 1 - store i64 %136, i64* %_M_string_length.i.i2.i599, align 8, !tbaa !104, !alias.scope !134 - %137 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i588 to %union.anon** - store %union.anon* %134, %union.anon** %137, align 8, !tbaa !107 - store i64 0, i64* %_M_string_length.i20.i.i598, align 8, !tbaa !104 - store i8 0, i8* %arraydecay.i.i.i.i590, align 1, !tbaa !87 - %_M_p.i.i.i.i601 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp24, i64 0, i32 0, i32 0 - %138 = load i8*, i8** %_M_p.i.i.i.i601, align 8, !tbaa !107 - %cmp.i.i.i603 = icmp eq i8* %138, %127 - br i1 %cmp.i.i.i603, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit605, label %if.then.i.i604 - -if.then.i.i604: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit600 - call void @_ZdlPv(i8* %138) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit605 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit605: ; preds = %if.then.i.i604, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit600 - call void @llvm.lifetime.end(i64 32, i8* nonnull %124) #7 - %_M_p.i.i606 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_3_b_path, i64 0, i32 0, i32 0 - %139 = load i8*, i8** %_M_p.i.i606, align 8, !tbaa !107 - %call27 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %139, i32 0, i32 1, i32 128, i32 1, i32 1) + store %union.anon* %131, %union.anon** %132, align 8, !tbaa !109, !alias.scope !152 + %_M_p.i.i23.i.i597 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i596, i64 0, i32 0, i32 0 + %133 = load i8*, i8** %_M_p.i.i23.i.i597, align 8, !tbaa !113 + %134 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i596, i64 0, i32 2 + %arraydecay.i.i.i.i598 = bitcast %union.anon* %134 to i8* + %cmp.i.i.i599 = icmp eq i8* %133, %arraydecay.i.i.i.i598 + br i1 %cmp.i.i.i599, label %if.then.i.i601, label %if.else.i.i605 + +if.then.i.i601: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit566 + %arraydecay.i.i.i600 = bitcast %union.anon* %131 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i600, i8* %133, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit608 + +if.else.i.i605: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit566 + %_M_p.i21.i.i602 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_3_b_path, i64 0, i32 0, i32 0 + store i8* %133, i8** %_M_p.i21.i.i602, align 8, !tbaa !113, !alias.scope !152 + %_M_allocated_capacity.i.i603 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i596, i64 0, i32 2, i32 0 + %135 = load i64, i64* %_M_allocated_capacity.i.i603, align 8, !tbaa !66 + %_M_allocated_capacity.i.i.i604 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_3_b_path, i64 0, i32 2, i32 0 + store i64 %135, i64* %_M_allocated_capacity.i.i.i604, align 8, !tbaa !66, !alias.scope !152 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit608 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit608: ; preds = %if.else.i.i605, %if.then.i.i601 + %_M_string_length.i20.i.i606 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i596, i64 0, i32 1 + %136 = load i64, i64* %_M_string_length.i20.i.i606, align 8, !tbaa !110 + %_M_string_length.i.i2.i607 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_3_b_path, i64 0, i32 1 + store i64 %136, i64* %_M_string_length.i.i2.i607, align 8, !tbaa !110, !alias.scope !152 + %137 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i596 to %union.anon** + store %union.anon* %134, %union.anon** %137, align 8, !tbaa !113 + store i64 0, i64* %_M_string_length.i20.i.i606, align 8, !tbaa !110 + store i8 0, i8* %arraydecay.i.i.i.i598, align 1, !tbaa !93 + %_M_p.i.i.i.i609 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp24, i64 0, i32 0, i32 0 + %138 = load i8*, i8** %_M_p.i.i.i.i609, align 8, !tbaa !113 + %cmp.i.i.i611 = icmp eq i8* %138, %127 + br i1 %cmp.i.i.i611, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit613, label %if.then.i.i612 + +if.then.i.i612: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit608 + call void @_ZdlPv(i8* %138) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit613 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit613: ; preds = %if.then.i.i612, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit608 + call void @llvm.lifetime.end(i64 32, i8* nonnull %124) #2 + %_M_p.i.i614 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_3_b_path, i64 0, i32 0, i32 0 + %139 = load i8*, i8** %_M_p.i.i614, align 8, !tbaa !113 + %call27 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %139, i32 0, i64 1, i64 128, i64 1, i64 1) %140 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_4_w_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %140) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %140) #2 %141 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp28 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %141) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %141) #2 %142 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp28, i64 0, i32 2 %143 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp28 to %union.anon** - store %union.anon* %142, %union.anon** %143, align 8, !tbaa !103 + store %union.anon* %142, %union.anon** %143, align 8, !tbaa !109 %144 = bitcast %union.anon* %142 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %144, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.32, i64 0, i64 0), i64 14, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i630 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp28, i64 0, i32 1 - store i64 14, i64* %_M_string_length.i.i.i.i.i.i630, align 8, !tbaa !104 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %144, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.50, i64 0, i64 0), i64 14, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i638 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp28, i64 0, i32 1 + store i64 14, i64* %_M_string_length.i.i.i.i.i.i638, align 8, !tbaa !110 %145 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp28, i64 0, i32 2, i32 1, i64 6 - store i8 0, i8* %145, align 2, !tbaa !87 - %146 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !104, !noalias !137 - %147 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !107, !noalias !137 - %call3.i.i.i635 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp28, i64 0, i64 0, i8* %147, i64 %146) #7, !noalias !137 + store i8 0, i8* %145, align 2, !tbaa !93 + %146 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !110, !noalias !155 + %147 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !113, !noalias !155 + %call3.i.i.i643 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp28, i64 0, i64 0, i8* %147, i64 %146) #2, !noalias !155 %148 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_4_w_path, i64 0, i32 2 %149 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_4_w_path to %union.anon** - store %union.anon* %148, %union.anon** %149, align 8, !tbaa !103, !alias.scope !137 - %_M_p.i.i23.i.i636 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i635, i64 0, i32 0, i32 0 - %150 = load i8*, i8** %_M_p.i.i23.i.i636, align 8, !tbaa !107 - %151 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i635, i64 0, i32 2 - %arraydecay.i.i.i.i637 = bitcast %union.anon* %151 to i8* - %cmp.i.i.i638 = icmp eq i8* %150, %arraydecay.i.i.i.i637 - br i1 %cmp.i.i.i638, label %if.then.i.i640, label %if.else.i.i644 - -if.then.i.i640: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit605 - %arraydecay.i.i.i639 = bitcast %union.anon* %148 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i639, i8* %150, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit647 - -if.else.i.i644: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit605 - %_M_p.i21.i.i641 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_4_w_path, i64 0, i32 0, i32 0 - store i8* %150, i8** %_M_p.i21.i.i641, align 8, !tbaa !107, !alias.scope !137 - %_M_allocated_capacity.i.i642 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i635, i64 0, i32 2, i32 0 - %152 = load i64, i64* %_M_allocated_capacity.i.i642, align 8, !tbaa !63 - %_M_allocated_capacity.i.i.i643 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_4_w_path, i64 0, i32 2, i32 0 - store i64 %152, i64* %_M_allocated_capacity.i.i.i643, align 8, !tbaa !63, !alias.scope !137 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit647 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit647: ; preds = %if.else.i.i644, %if.then.i.i640 - %_M_string_length.i20.i.i645 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i635, i64 0, i32 1 - %153 = load i64, i64* %_M_string_length.i20.i.i645, align 8, !tbaa !104 - %_M_string_length.i.i2.i646 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_4_w_path, i64 0, i32 1 - store i64 %153, i64* %_M_string_length.i.i2.i646, align 8, !tbaa !104, !alias.scope !137 - %154 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i635 to %union.anon** - store %union.anon* %151, %union.anon** %154, align 8, !tbaa !107 - store i64 0, i64* %_M_string_length.i20.i.i645, align 8, !tbaa !104 - store i8 0, i8* %arraydecay.i.i.i.i637, align 1, !tbaa !87 - %_M_p.i.i.i.i648 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp28, i64 0, i32 0, i32 0 - %155 = load i8*, i8** %_M_p.i.i.i.i648, align 8, !tbaa !107 - %cmp.i.i.i650 = icmp eq i8* %155, %144 - br i1 %cmp.i.i.i650, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit652, label %if.then.i.i651 - -if.then.i.i651: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit647 - call void @_ZdlPv(i8* %155) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit652 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit652: ; preds = %if.then.i.i651, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit647 - call void @llvm.lifetime.end(i64 32, i8* nonnull %141) #7 - %_M_p.i.i653 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_4_w_path, i64 0, i32 0, i32 0 - %156 = load i8*, i8** %_M_p.i.i653, align 8, !tbaa !107 - %call31 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %156, i32 0, i32 128, i32 128, i32 3, i32 3) + store %union.anon* %148, %union.anon** %149, align 8, !tbaa !109, !alias.scope !155 + %_M_p.i.i23.i.i644 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i643, i64 0, i32 0, i32 0 + %150 = load i8*, i8** %_M_p.i.i23.i.i644, align 8, !tbaa !113 + %151 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i643, i64 0, i32 2 + %arraydecay.i.i.i.i645 = bitcast %union.anon* %151 to i8* + %cmp.i.i.i646 = icmp eq i8* %150, %arraydecay.i.i.i.i645 + br i1 %cmp.i.i.i646, label %if.then.i.i648, label %if.else.i.i652 + +if.then.i.i648: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit613 + %arraydecay.i.i.i647 = bitcast %union.anon* %148 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i647, i8* %150, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit655 + +if.else.i.i652: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit613 + %_M_p.i21.i.i649 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_4_w_path, i64 0, i32 0, i32 0 + store i8* %150, i8** %_M_p.i21.i.i649, align 8, !tbaa !113, !alias.scope !155 + %_M_allocated_capacity.i.i650 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i643, i64 0, i32 2, i32 0 + %152 = load i64, i64* %_M_allocated_capacity.i.i650, align 8, !tbaa !66 + %_M_allocated_capacity.i.i.i651 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_4_w_path, i64 0, i32 2, i32 0 + store i64 %152, i64* %_M_allocated_capacity.i.i.i651, align 8, !tbaa !66, !alias.scope !155 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit655 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit655: ; preds = %if.else.i.i652, %if.then.i.i648 + %_M_string_length.i20.i.i653 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i643, i64 0, i32 1 + %153 = load i64, i64* %_M_string_length.i20.i.i653, align 8, !tbaa !110 + %_M_string_length.i.i2.i654 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_4_w_path, i64 0, i32 1 + store i64 %153, i64* %_M_string_length.i.i2.i654, align 8, !tbaa !110, !alias.scope !155 + %154 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i643 to %union.anon** + store %union.anon* %151, %union.anon** %154, align 8, !tbaa !113 + store i64 0, i64* %_M_string_length.i20.i.i653, align 8, !tbaa !110 + store i8 0, i8* %arraydecay.i.i.i.i645, align 1, !tbaa !93 + %_M_p.i.i.i.i656 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp28, i64 0, i32 0, i32 0 + %155 = load i8*, i8** %_M_p.i.i.i.i656, align 8, !tbaa !113 + %cmp.i.i.i658 = icmp eq i8* %155, %144 + br i1 %cmp.i.i.i658, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit660, label %if.then.i.i659 + +if.then.i.i659: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit655 + call void @_ZdlPv(i8* %155) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit660 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit660: ; preds = %if.then.i.i659, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit655 + call void @llvm.lifetime.end(i64 32, i8* nonnull %141) #2 + %_M_p.i.i661 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_4_w_path, i64 0, i32 0, i32 0 + %156 = load i8*, i8** %_M_p.i.i661, align 8, !tbaa !113 + %call31 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %156, i32 0, i64 128, i64 128, i64 3, i64 3) %157 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_4_b_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %157) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %157) #2 %158 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp32 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %158) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %158) #2 %159 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp32, i64 0, i32 2 %160 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp32 to %union.anon** - store %union.anon* %159, %union.anon** %160, align 8, !tbaa !103 + store %union.anon* %159, %union.anon** %160, align 8, !tbaa !109 %161 = bitcast %union.anon* %159 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %161, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.33, i64 0, i64 0), i64 14, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i677 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp32, i64 0, i32 1 - store i64 14, i64* %_M_string_length.i.i.i.i.i.i677, align 8, !tbaa !104 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %161, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.51, i64 0, i64 0), i64 14, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i685 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp32, i64 0, i32 1 + store i64 14, i64* %_M_string_length.i.i.i.i.i.i685, align 8, !tbaa !110 %162 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp32, i64 0, i32 2, i32 1, i64 6 - store i8 0, i8* %162, align 2, !tbaa !87 - %163 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !104, !noalias !140 - %164 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !107, !noalias !140 - %call3.i.i.i682 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp32, i64 0, i64 0, i8* %164, i64 %163) #7, !noalias !140 + store i8 0, i8* %162, align 2, !tbaa !93 + %163 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !110, !noalias !158 + %164 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !113, !noalias !158 + %call3.i.i.i690 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp32, i64 0, i64 0, i8* %164, i64 %163) #2, !noalias !158 %165 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_4_b_path, i64 0, i32 2 %166 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_4_b_path to %union.anon** - store %union.anon* %165, %union.anon** %166, align 8, !tbaa !103, !alias.scope !140 - %_M_p.i.i23.i.i683 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i682, i64 0, i32 0, i32 0 - %167 = load i8*, i8** %_M_p.i.i23.i.i683, align 8, !tbaa !107 - %168 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i682, i64 0, i32 2 - %arraydecay.i.i.i.i684 = bitcast %union.anon* %168 to i8* - %cmp.i.i.i685 = icmp eq i8* %167, %arraydecay.i.i.i.i684 - br i1 %cmp.i.i.i685, label %if.then.i.i687, label %if.else.i.i691 - -if.then.i.i687: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit652 - %arraydecay.i.i.i686 = bitcast %union.anon* %165 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i686, i8* %167, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit694 - -if.else.i.i691: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit652 - %_M_p.i21.i.i688 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_4_b_path, i64 0, i32 0, i32 0 - store i8* %167, i8** %_M_p.i21.i.i688, align 8, !tbaa !107, !alias.scope !140 - %_M_allocated_capacity.i.i689 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i682, i64 0, i32 2, i32 0 - %169 = load i64, i64* %_M_allocated_capacity.i.i689, align 8, !tbaa !63 - %_M_allocated_capacity.i.i.i690 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_4_b_path, i64 0, i32 2, i32 0 - store i64 %169, i64* %_M_allocated_capacity.i.i.i690, align 8, !tbaa !63, !alias.scope !140 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit694 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit694: ; preds = %if.else.i.i691, %if.then.i.i687 - %_M_string_length.i20.i.i692 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i682, i64 0, i32 1 - %170 = load i64, i64* %_M_string_length.i20.i.i692, align 8, !tbaa !104 - %_M_string_length.i.i2.i693 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_4_b_path, i64 0, i32 1 - store i64 %170, i64* %_M_string_length.i.i2.i693, align 8, !tbaa !104, !alias.scope !140 - %171 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i682 to %union.anon** - store %union.anon* %168, %union.anon** %171, align 8, !tbaa !107 - store i64 0, i64* %_M_string_length.i20.i.i692, align 8, !tbaa !104 - store i8 0, i8* %arraydecay.i.i.i.i684, align 1, !tbaa !87 - %_M_p.i.i.i.i695 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp32, i64 0, i32 0, i32 0 - %172 = load i8*, i8** %_M_p.i.i.i.i695, align 8, !tbaa !107 - %cmp.i.i.i697 = icmp eq i8* %172, %161 - br i1 %cmp.i.i.i697, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit699, label %if.then.i.i698 - -if.then.i.i698: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit694 - call void @_ZdlPv(i8* %172) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit699 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit699: ; preds = %if.then.i.i698, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit694 - call void @llvm.lifetime.end(i64 32, i8* nonnull %158) #7 - %_M_p.i.i700 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_4_b_path, i64 0, i32 0, i32 0 - %173 = load i8*, i8** %_M_p.i.i700, align 8, !tbaa !107 - %call35 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %173, i32 0, i32 1, i32 128, i32 1, i32 1) + store %union.anon* %165, %union.anon** %166, align 8, !tbaa !109, !alias.scope !158 + %_M_p.i.i23.i.i691 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i690, i64 0, i32 0, i32 0 + %167 = load i8*, i8** %_M_p.i.i23.i.i691, align 8, !tbaa !113 + %168 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i690, i64 0, i32 2 + %arraydecay.i.i.i.i692 = bitcast %union.anon* %168 to i8* + %cmp.i.i.i693 = icmp eq i8* %167, %arraydecay.i.i.i.i692 + br i1 %cmp.i.i.i693, label %if.then.i.i695, label %if.else.i.i699 + +if.then.i.i695: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit660 + %arraydecay.i.i.i694 = bitcast %union.anon* %165 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i694, i8* %167, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit702 + +if.else.i.i699: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit660 + %_M_p.i21.i.i696 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_4_b_path, i64 0, i32 0, i32 0 + store i8* %167, i8** %_M_p.i21.i.i696, align 8, !tbaa !113, !alias.scope !158 + %_M_allocated_capacity.i.i697 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i690, i64 0, i32 2, i32 0 + %169 = load i64, i64* %_M_allocated_capacity.i.i697, align 8, !tbaa !66 + %_M_allocated_capacity.i.i.i698 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_4_b_path, i64 0, i32 2, i32 0 + store i64 %169, i64* %_M_allocated_capacity.i.i.i698, align 8, !tbaa !66, !alias.scope !158 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit702 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit702: ; preds = %if.else.i.i699, %if.then.i.i695 + %_M_string_length.i20.i.i700 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i690, i64 0, i32 1 + %170 = load i64, i64* %_M_string_length.i20.i.i700, align 8, !tbaa !110 + %_M_string_length.i.i2.i701 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_4_b_path, i64 0, i32 1 + store i64 %170, i64* %_M_string_length.i.i2.i701, align 8, !tbaa !110, !alias.scope !158 + %171 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i690 to %union.anon** + store %union.anon* %168, %union.anon** %171, align 8, !tbaa !113 + store i64 0, i64* %_M_string_length.i20.i.i700, align 8, !tbaa !110 + store i8 0, i8* %arraydecay.i.i.i.i692, align 1, !tbaa !93 + %_M_p.i.i.i.i703 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp32, i64 0, i32 0, i32 0 + %172 = load i8*, i8** %_M_p.i.i.i.i703, align 8, !tbaa !113 + %cmp.i.i.i705 = icmp eq i8* %172, %161 + br i1 %cmp.i.i.i705, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit707, label %if.then.i.i706 + +if.then.i.i706: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit702 + call void @_ZdlPv(i8* %172) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit707 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit707: ; preds = %if.then.i.i706, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit702 + call void @llvm.lifetime.end(i64 32, i8* nonnull %158) #2 + %_M_p.i.i708 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_4_b_path, i64 0, i32 0, i32 0 + %173 = load i8*, i8** %_M_p.i.i708, align 8, !tbaa !113 + %call35 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %173, i32 0, i64 1, i64 128, i64 1, i64 1) %174 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_5_w_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %174) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %174) #2 %175 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp36 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %175) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %175) #2 %176 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp36, i64 0, i32 2 %177 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp36 to %union.anon** - store %union.anon* %176, %union.anon** %177, align 8, !tbaa !103 + store %union.anon* %176, %union.anon** %177, align 8, !tbaa !109 %178 = bitcast %union.anon* %176 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %178, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.34, i64 0, i64 0), i64 14, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i724 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp36, i64 0, i32 1 - store i64 14, i64* %_M_string_length.i.i.i.i.i.i724, align 8, !tbaa !104 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %178, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.52, i64 0, i64 0), i64 14, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i732 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp36, i64 0, i32 1 + store i64 14, i64* %_M_string_length.i.i.i.i.i.i732, align 8, !tbaa !110 %179 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp36, i64 0, i32 2, i32 1, i64 6 - store i8 0, i8* %179, align 2, !tbaa !87 - %180 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !104, !noalias !143 - %181 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !107, !noalias !143 - %call3.i.i.i729 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp36, i64 0, i64 0, i8* %181, i64 %180) #7, !noalias !143 + store i8 0, i8* %179, align 2, !tbaa !93 + %180 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !110, !noalias !161 + %181 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !113, !noalias !161 + %call3.i.i.i737 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp36, i64 0, i64 0, i8* %181, i64 %180) #2, !noalias !161 %182 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_5_w_path, i64 0, i32 2 %183 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_5_w_path to %union.anon** - store %union.anon* %182, %union.anon** %183, align 8, !tbaa !103, !alias.scope !143 - %_M_p.i.i23.i.i730 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i729, i64 0, i32 0, i32 0 - %184 = load i8*, i8** %_M_p.i.i23.i.i730, align 8, !tbaa !107 - %185 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i729, i64 0, i32 2 - %arraydecay.i.i.i.i731 = bitcast %union.anon* %185 to i8* - %cmp.i.i.i732 = icmp eq i8* %184, %arraydecay.i.i.i.i731 - br i1 %cmp.i.i.i732, label %if.then.i.i734, label %if.else.i.i738 - -if.then.i.i734: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit699 - %arraydecay.i.i.i733 = bitcast %union.anon* %182 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i733, i8* %184, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit741 - -if.else.i.i738: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit699 - %_M_p.i21.i.i735 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_5_w_path, i64 0, i32 0, i32 0 - store i8* %184, i8** %_M_p.i21.i.i735, align 8, !tbaa !107, !alias.scope !143 - %_M_allocated_capacity.i.i736 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i729, i64 0, i32 2, i32 0 - %186 = load i64, i64* %_M_allocated_capacity.i.i736, align 8, !tbaa !63 - %_M_allocated_capacity.i.i.i737 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_5_w_path, i64 0, i32 2, i32 0 - store i64 %186, i64* %_M_allocated_capacity.i.i.i737, align 8, !tbaa !63, !alias.scope !143 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit741 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit741: ; preds = %if.else.i.i738, %if.then.i.i734 - %_M_string_length.i20.i.i739 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i729, i64 0, i32 1 - %187 = load i64, i64* %_M_string_length.i20.i.i739, align 8, !tbaa !104 - %_M_string_length.i.i2.i740 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_5_w_path, i64 0, i32 1 - store i64 %187, i64* %_M_string_length.i.i2.i740, align 8, !tbaa !104, !alias.scope !143 - %188 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i729 to %union.anon** - store %union.anon* %185, %union.anon** %188, align 8, !tbaa !107 - store i64 0, i64* %_M_string_length.i20.i.i739, align 8, !tbaa !104 - store i8 0, i8* %arraydecay.i.i.i.i731, align 1, !tbaa !87 - %_M_p.i.i.i.i742 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp36, i64 0, i32 0, i32 0 - %189 = load i8*, i8** %_M_p.i.i.i.i742, align 8, !tbaa !107 - %cmp.i.i.i744 = icmp eq i8* %189, %178 - br i1 %cmp.i.i.i744, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit746, label %if.then.i.i745 - -if.then.i.i745: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit741 - call void @_ZdlPv(i8* %189) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit746 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit746: ; preds = %if.then.i.i745, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit741 - call void @llvm.lifetime.end(i64 32, i8* nonnull %175) #7 - %_M_p.i.i747 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_5_w_path, i64 0, i32 0, i32 0 - %190 = load i8*, i8** %_M_p.i.i747, align 8, !tbaa !107 - %call39 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %190, i32 0, i32 256, i32 128, i32 3, i32 3) + store %union.anon* %182, %union.anon** %183, align 8, !tbaa !109, !alias.scope !161 + %_M_p.i.i23.i.i738 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i737, i64 0, i32 0, i32 0 + %184 = load i8*, i8** %_M_p.i.i23.i.i738, align 8, !tbaa !113 + %185 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i737, i64 0, i32 2 + %arraydecay.i.i.i.i739 = bitcast %union.anon* %185 to i8* + %cmp.i.i.i740 = icmp eq i8* %184, %arraydecay.i.i.i.i739 + br i1 %cmp.i.i.i740, label %if.then.i.i742, label %if.else.i.i746 + +if.then.i.i742: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit707 + %arraydecay.i.i.i741 = bitcast %union.anon* %182 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i741, i8* %184, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit749 + +if.else.i.i746: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit707 + %_M_p.i21.i.i743 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_5_w_path, i64 0, i32 0, i32 0 + store i8* %184, i8** %_M_p.i21.i.i743, align 8, !tbaa !113, !alias.scope !161 + %_M_allocated_capacity.i.i744 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i737, i64 0, i32 2, i32 0 + %186 = load i64, i64* %_M_allocated_capacity.i.i744, align 8, !tbaa !66 + %_M_allocated_capacity.i.i.i745 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_5_w_path, i64 0, i32 2, i32 0 + store i64 %186, i64* %_M_allocated_capacity.i.i.i745, align 8, !tbaa !66, !alias.scope !161 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit749 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit749: ; preds = %if.else.i.i746, %if.then.i.i742 + %_M_string_length.i20.i.i747 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i737, i64 0, i32 1 + %187 = load i64, i64* %_M_string_length.i20.i.i747, align 8, !tbaa !110 + %_M_string_length.i.i2.i748 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_5_w_path, i64 0, i32 1 + store i64 %187, i64* %_M_string_length.i.i2.i748, align 8, !tbaa !110, !alias.scope !161 + %188 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i737 to %union.anon** + store %union.anon* %185, %union.anon** %188, align 8, !tbaa !113 + store i64 0, i64* %_M_string_length.i20.i.i747, align 8, !tbaa !110 + store i8 0, i8* %arraydecay.i.i.i.i739, align 1, !tbaa !93 + %_M_p.i.i.i.i750 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp36, i64 0, i32 0, i32 0 + %189 = load i8*, i8** %_M_p.i.i.i.i750, align 8, !tbaa !113 + %cmp.i.i.i752 = icmp eq i8* %189, %178 + br i1 %cmp.i.i.i752, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit754, label %if.then.i.i753 + +if.then.i.i753: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit749 + call void @_ZdlPv(i8* %189) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit754 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit754: ; preds = %if.then.i.i753, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit749 + call void @llvm.lifetime.end(i64 32, i8* nonnull %175) #2 + %_M_p.i.i755 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_5_w_path, i64 0, i32 0, i32 0 + %190 = load i8*, i8** %_M_p.i.i755, align 8, !tbaa !113 + %call39 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %190, i32 0, i64 256, i64 128, i64 3, i64 3) %191 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_5_b_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %191) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %191) #2 %192 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp40 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %192) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %192) #2 %193 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp40, i64 0, i32 2 %194 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp40 to %union.anon** - store %union.anon* %193, %union.anon** %194, align 8, !tbaa !103 + store %union.anon* %193, %union.anon** %194, align 8, !tbaa !109 %195 = bitcast %union.anon* %193 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %195, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.35, i64 0, i64 0), i64 14, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i771 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp40, i64 0, i32 1 - store i64 14, i64* %_M_string_length.i.i.i.i.i.i771, align 8, !tbaa !104 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %195, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.53, i64 0, i64 0), i64 14, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i779 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp40, i64 0, i32 1 + store i64 14, i64* %_M_string_length.i.i.i.i.i.i779, align 8, !tbaa !110 %196 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp40, i64 0, i32 2, i32 1, i64 6 - store i8 0, i8* %196, align 2, !tbaa !87 - %197 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !104, !noalias !146 - %198 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !107, !noalias !146 - %call3.i.i.i776 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp40, i64 0, i64 0, i8* %198, i64 %197) #7, !noalias !146 + store i8 0, i8* %196, align 2, !tbaa !93 + %197 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !110, !noalias !164 + %198 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !113, !noalias !164 + %call3.i.i.i784 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp40, i64 0, i64 0, i8* %198, i64 %197) #2, !noalias !164 %199 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_5_b_path, i64 0, i32 2 %200 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_5_b_path to %union.anon** - store %union.anon* %199, %union.anon** %200, align 8, !tbaa !103, !alias.scope !146 - %_M_p.i.i23.i.i777 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i776, i64 0, i32 0, i32 0 - %201 = load i8*, i8** %_M_p.i.i23.i.i777, align 8, !tbaa !107 - %202 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i776, i64 0, i32 2 - %arraydecay.i.i.i.i778 = bitcast %union.anon* %202 to i8* - %cmp.i.i.i779 = icmp eq i8* %201, %arraydecay.i.i.i.i778 - br i1 %cmp.i.i.i779, label %if.then.i.i781, label %if.else.i.i785 - -if.then.i.i781: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit746 - %arraydecay.i.i.i780 = bitcast %union.anon* %199 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i780, i8* %201, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit788 - -if.else.i.i785: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit746 - %_M_p.i21.i.i782 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_5_b_path, i64 0, i32 0, i32 0 - store i8* %201, i8** %_M_p.i21.i.i782, align 8, !tbaa !107, !alias.scope !146 - %_M_allocated_capacity.i.i783 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i776, i64 0, i32 2, i32 0 - %203 = load i64, i64* %_M_allocated_capacity.i.i783, align 8, !tbaa !63 - %_M_allocated_capacity.i.i.i784 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_5_b_path, i64 0, i32 2, i32 0 - store i64 %203, i64* %_M_allocated_capacity.i.i.i784, align 8, !tbaa !63, !alias.scope !146 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit788 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit788: ; preds = %if.else.i.i785, %if.then.i.i781 - %_M_string_length.i20.i.i786 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i776, i64 0, i32 1 - %204 = load i64, i64* %_M_string_length.i20.i.i786, align 8, !tbaa !104 - %_M_string_length.i.i2.i787 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_5_b_path, i64 0, i32 1 - store i64 %204, i64* %_M_string_length.i.i2.i787, align 8, !tbaa !104, !alias.scope !146 - %205 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i776 to %union.anon** - store %union.anon* %202, %union.anon** %205, align 8, !tbaa !107 - store i64 0, i64* %_M_string_length.i20.i.i786, align 8, !tbaa !104 - store i8 0, i8* %arraydecay.i.i.i.i778, align 1, !tbaa !87 - %_M_p.i.i.i.i789 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp40, i64 0, i32 0, i32 0 - %206 = load i8*, i8** %_M_p.i.i.i.i789, align 8, !tbaa !107 - %cmp.i.i.i791 = icmp eq i8* %206, %195 - br i1 %cmp.i.i.i791, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit793, label %if.then.i.i792 - -if.then.i.i792: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit788 - call void @_ZdlPv(i8* %206) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit793 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit793: ; preds = %if.then.i.i792, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit788 - call void @llvm.lifetime.end(i64 32, i8* nonnull %192) #7 - %_M_p.i.i794 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_5_b_path, i64 0, i32 0, i32 0 - %207 = load i8*, i8** %_M_p.i.i794, align 8, !tbaa !107 - %call43 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %207, i32 0, i32 1, i32 256, i32 1, i32 1) + store %union.anon* %199, %union.anon** %200, align 8, !tbaa !109, !alias.scope !164 + %_M_p.i.i23.i.i785 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i784, i64 0, i32 0, i32 0 + %201 = load i8*, i8** %_M_p.i.i23.i.i785, align 8, !tbaa !113 + %202 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i784, i64 0, i32 2 + %arraydecay.i.i.i.i786 = bitcast %union.anon* %202 to i8* + %cmp.i.i.i787 = icmp eq i8* %201, %arraydecay.i.i.i.i786 + br i1 %cmp.i.i.i787, label %if.then.i.i789, label %if.else.i.i793 + +if.then.i.i789: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit754 + %arraydecay.i.i.i788 = bitcast %union.anon* %199 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i788, i8* %201, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit796 + +if.else.i.i793: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit754 + %_M_p.i21.i.i790 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_5_b_path, i64 0, i32 0, i32 0 + store i8* %201, i8** %_M_p.i21.i.i790, align 8, !tbaa !113, !alias.scope !164 + %_M_allocated_capacity.i.i791 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i784, i64 0, i32 2, i32 0 + %203 = load i64, i64* %_M_allocated_capacity.i.i791, align 8, !tbaa !66 + %_M_allocated_capacity.i.i.i792 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_5_b_path, i64 0, i32 2, i32 0 + store i64 %203, i64* %_M_allocated_capacity.i.i.i792, align 8, !tbaa !66, !alias.scope !164 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit796 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit796: ; preds = %if.else.i.i793, %if.then.i.i789 + %_M_string_length.i20.i.i794 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i784, i64 0, i32 1 + %204 = load i64, i64* %_M_string_length.i20.i.i794, align 8, !tbaa !110 + %_M_string_length.i.i2.i795 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_5_b_path, i64 0, i32 1 + store i64 %204, i64* %_M_string_length.i.i2.i795, align 8, !tbaa !110, !alias.scope !164 + %205 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i784 to %union.anon** + store %union.anon* %202, %union.anon** %205, align 8, !tbaa !113 + store i64 0, i64* %_M_string_length.i20.i.i794, align 8, !tbaa !110 + store i8 0, i8* %arraydecay.i.i.i.i786, align 1, !tbaa !93 + %_M_p.i.i.i.i797 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp40, i64 0, i32 0, i32 0 + %206 = load i8*, i8** %_M_p.i.i.i.i797, align 8, !tbaa !113 + %cmp.i.i.i799 = icmp eq i8* %206, %195 + br i1 %cmp.i.i.i799, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit801, label %if.then.i.i800 + +if.then.i.i800: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit796 + call void @_ZdlPv(i8* %206) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit801 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit801: ; preds = %if.then.i.i800, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit796 + call void @llvm.lifetime.end(i64 32, i8* nonnull %192) #2 + %_M_p.i.i802 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_5_b_path, i64 0, i32 0, i32 0 + %207 = load i8*, i8** %_M_p.i.i802, align 8, !tbaa !113 + %call43 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %207, i32 0, i64 1, i64 256, i64 1, i64 1) %208 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_6_w_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %208) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %208) #2 %209 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp44 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %209) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %209) #2 %210 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp44, i64 0, i32 2 %211 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp44 to %union.anon** - store %union.anon* %210, %union.anon** %211, align 8, !tbaa !103 + store %union.anon* %210, %union.anon** %211, align 8, !tbaa !109 %212 = bitcast %union.anon* %210 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %212, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.36, i64 0, i64 0), i64 14, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i818 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp44, i64 0, i32 1 - store i64 14, i64* %_M_string_length.i.i.i.i.i.i818, align 8, !tbaa !104 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %212, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.54, i64 0, i64 0), i64 14, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i826 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp44, i64 0, i32 1 + store i64 14, i64* %_M_string_length.i.i.i.i.i.i826, align 8, !tbaa !110 %213 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp44, i64 0, i32 2, i32 1, i64 6 - store i8 0, i8* %213, align 2, !tbaa !87 - %214 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !104, !noalias !149 - %215 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !107, !noalias !149 - %call3.i.i.i823 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp44, i64 0, i64 0, i8* %215, i64 %214) #7, !noalias !149 + store i8 0, i8* %213, align 2, !tbaa !93 + %214 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !110, !noalias !167 + %215 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !113, !noalias !167 + %call3.i.i.i831 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp44, i64 0, i64 0, i8* %215, i64 %214) #2, !noalias !167 %216 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_6_w_path, i64 0, i32 2 %217 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_6_w_path to %union.anon** - store %union.anon* %216, %union.anon** %217, align 8, !tbaa !103, !alias.scope !149 - %_M_p.i.i23.i.i824 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i823, i64 0, i32 0, i32 0 - %218 = load i8*, i8** %_M_p.i.i23.i.i824, align 8, !tbaa !107 - %219 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i823, i64 0, i32 2 - %arraydecay.i.i.i.i825 = bitcast %union.anon* %219 to i8* - %cmp.i.i.i826 = icmp eq i8* %218, %arraydecay.i.i.i.i825 - br i1 %cmp.i.i.i826, label %if.then.i.i828, label %if.else.i.i832 - -if.then.i.i828: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit793 - %arraydecay.i.i.i827 = bitcast %union.anon* %216 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i827, i8* %218, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit835 - -if.else.i.i832: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit793 - %_M_p.i21.i.i829 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_6_w_path, i64 0, i32 0, i32 0 - store i8* %218, i8** %_M_p.i21.i.i829, align 8, !tbaa !107, !alias.scope !149 - %_M_allocated_capacity.i.i830 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i823, i64 0, i32 2, i32 0 - %220 = load i64, i64* %_M_allocated_capacity.i.i830, align 8, !tbaa !63 - %_M_allocated_capacity.i.i.i831 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_6_w_path, i64 0, i32 2, i32 0 - store i64 %220, i64* %_M_allocated_capacity.i.i.i831, align 8, !tbaa !63, !alias.scope !149 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit835 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit835: ; preds = %if.else.i.i832, %if.then.i.i828 - %_M_string_length.i20.i.i833 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i823, i64 0, i32 1 - %221 = load i64, i64* %_M_string_length.i20.i.i833, align 8, !tbaa !104 - %_M_string_length.i.i2.i834 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_6_w_path, i64 0, i32 1 - store i64 %221, i64* %_M_string_length.i.i2.i834, align 8, !tbaa !104, !alias.scope !149 - %222 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i823 to %union.anon** - store %union.anon* %219, %union.anon** %222, align 8, !tbaa !107 - store i64 0, i64* %_M_string_length.i20.i.i833, align 8, !tbaa !104 - store i8 0, i8* %arraydecay.i.i.i.i825, align 1, !tbaa !87 - %_M_p.i.i.i.i836 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp44, i64 0, i32 0, i32 0 - %223 = load i8*, i8** %_M_p.i.i.i.i836, align 8, !tbaa !107 - %cmp.i.i.i838 = icmp eq i8* %223, %212 - br i1 %cmp.i.i.i838, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit840, label %if.then.i.i839 - -if.then.i.i839: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit835 - call void @_ZdlPv(i8* %223) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit840 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit840: ; preds = %if.then.i.i839, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit835 - call void @llvm.lifetime.end(i64 32, i8* nonnull %209) #7 - %_M_p.i.i841 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_6_w_path, i64 0, i32 0, i32 0 - %224 = load i8*, i8** %_M_p.i.i841, align 8, !tbaa !107 - %call47 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %224, i32 0, i32 256, i32 256, i32 3, i32 3) + store %union.anon* %216, %union.anon** %217, align 8, !tbaa !109, !alias.scope !167 + %_M_p.i.i23.i.i832 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i831, i64 0, i32 0, i32 0 + %218 = load i8*, i8** %_M_p.i.i23.i.i832, align 8, !tbaa !113 + %219 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i831, i64 0, i32 2 + %arraydecay.i.i.i.i833 = bitcast %union.anon* %219 to i8* + %cmp.i.i.i834 = icmp eq i8* %218, %arraydecay.i.i.i.i833 + br i1 %cmp.i.i.i834, label %if.then.i.i836, label %if.else.i.i840 + +if.then.i.i836: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit801 + %arraydecay.i.i.i835 = bitcast %union.anon* %216 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i835, i8* %218, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit843 + +if.else.i.i840: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit801 + %_M_p.i21.i.i837 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_6_w_path, i64 0, i32 0, i32 0 + store i8* %218, i8** %_M_p.i21.i.i837, align 8, !tbaa !113, !alias.scope !167 + %_M_allocated_capacity.i.i838 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i831, i64 0, i32 2, i32 0 + %220 = load i64, i64* %_M_allocated_capacity.i.i838, align 8, !tbaa !66 + %_M_allocated_capacity.i.i.i839 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_6_w_path, i64 0, i32 2, i32 0 + store i64 %220, i64* %_M_allocated_capacity.i.i.i839, align 8, !tbaa !66, !alias.scope !167 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit843 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit843: ; preds = %if.else.i.i840, %if.then.i.i836 + %_M_string_length.i20.i.i841 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i831, i64 0, i32 1 + %221 = load i64, i64* %_M_string_length.i20.i.i841, align 8, !tbaa !110 + %_M_string_length.i.i2.i842 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_6_w_path, i64 0, i32 1 + store i64 %221, i64* %_M_string_length.i.i2.i842, align 8, !tbaa !110, !alias.scope !167 + %222 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i831 to %union.anon** + store %union.anon* %219, %union.anon** %222, align 8, !tbaa !113 + store i64 0, i64* %_M_string_length.i20.i.i841, align 8, !tbaa !110 + store i8 0, i8* %arraydecay.i.i.i.i833, align 1, !tbaa !93 + %_M_p.i.i.i.i844 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp44, i64 0, i32 0, i32 0 + %223 = load i8*, i8** %_M_p.i.i.i.i844, align 8, !tbaa !113 + %cmp.i.i.i846 = icmp eq i8* %223, %212 + br i1 %cmp.i.i.i846, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit848, label %if.then.i.i847 + +if.then.i.i847: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit843 + call void @_ZdlPv(i8* %223) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit848 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit848: ; preds = %if.then.i.i847, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit843 + call void @llvm.lifetime.end(i64 32, i8* nonnull %209) #2 + %_M_p.i.i849 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_6_w_path, i64 0, i32 0, i32 0 + %224 = load i8*, i8** %_M_p.i.i849, align 8, !tbaa !113 + %call47 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %224, i32 0, i64 256, i64 256, i64 3, i64 3) %225 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_6_b_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %225) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %225) #2 %226 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp48 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %226) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %226) #2 %227 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp48, i64 0, i32 2 %228 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp48 to %union.anon** - store %union.anon* %227, %union.anon** %228, align 8, !tbaa !103 + store %union.anon* %227, %union.anon** %228, align 8, !tbaa !109 %229 = bitcast %union.anon* %227 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %229, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.37, i64 0, i64 0), i64 14, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i865 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp48, i64 0, i32 1 - store i64 14, i64* %_M_string_length.i.i.i.i.i.i865, align 8, !tbaa !104 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %229, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.55, i64 0, i64 0), i64 14, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i873 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp48, i64 0, i32 1 + store i64 14, i64* %_M_string_length.i.i.i.i.i.i873, align 8, !tbaa !110 %230 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp48, i64 0, i32 2, i32 1, i64 6 - store i8 0, i8* %230, align 2, !tbaa !87 - %231 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !104, !noalias !152 - %232 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !107, !noalias !152 - %call3.i.i.i870 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp48, i64 0, i64 0, i8* %232, i64 %231) #7, !noalias !152 + store i8 0, i8* %230, align 2, !tbaa !93 + %231 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !110, !noalias !170 + %232 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !113, !noalias !170 + %call3.i.i.i878 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp48, i64 0, i64 0, i8* %232, i64 %231) #2, !noalias !170 %233 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_6_b_path, i64 0, i32 2 %234 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_6_b_path to %union.anon** - store %union.anon* %233, %union.anon** %234, align 8, !tbaa !103, !alias.scope !152 - %_M_p.i.i23.i.i871 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i870, i64 0, i32 0, i32 0 - %235 = load i8*, i8** %_M_p.i.i23.i.i871, align 8, !tbaa !107 - %236 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i870, i64 0, i32 2 - %arraydecay.i.i.i.i872 = bitcast %union.anon* %236 to i8* - %cmp.i.i.i873 = icmp eq i8* %235, %arraydecay.i.i.i.i872 - br i1 %cmp.i.i.i873, label %if.then.i.i875, label %if.else.i.i879 - -if.then.i.i875: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit840 - %arraydecay.i.i.i874 = bitcast %union.anon* %233 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i874, i8* %235, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit882 - -if.else.i.i879: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit840 - %_M_p.i21.i.i876 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_6_b_path, i64 0, i32 0, i32 0 - store i8* %235, i8** %_M_p.i21.i.i876, align 8, !tbaa !107, !alias.scope !152 - %_M_allocated_capacity.i.i877 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i870, i64 0, i32 2, i32 0 - %237 = load i64, i64* %_M_allocated_capacity.i.i877, align 8, !tbaa !63 - %_M_allocated_capacity.i.i.i878 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_6_b_path, i64 0, i32 2, i32 0 - store i64 %237, i64* %_M_allocated_capacity.i.i.i878, align 8, !tbaa !63, !alias.scope !152 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit882 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit882: ; preds = %if.else.i.i879, %if.then.i.i875 - %_M_string_length.i20.i.i880 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i870, i64 0, i32 1 - %238 = load i64, i64* %_M_string_length.i20.i.i880, align 8, !tbaa !104 - %_M_string_length.i.i2.i881 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_6_b_path, i64 0, i32 1 - store i64 %238, i64* %_M_string_length.i.i2.i881, align 8, !tbaa !104, !alias.scope !152 - %239 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i870 to %union.anon** - store %union.anon* %236, %union.anon** %239, align 8, !tbaa !107 - store i64 0, i64* %_M_string_length.i20.i.i880, align 8, !tbaa !104 - store i8 0, i8* %arraydecay.i.i.i.i872, align 1, !tbaa !87 - %_M_p.i.i.i.i883 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp48, i64 0, i32 0, i32 0 - %240 = load i8*, i8** %_M_p.i.i.i.i883, align 8, !tbaa !107 - %cmp.i.i.i885 = icmp eq i8* %240, %229 - br i1 %cmp.i.i.i885, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit887, label %if.then.i.i886 - -if.then.i.i886: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit882 - call void @_ZdlPv(i8* %240) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit887 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit887: ; preds = %if.then.i.i886, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit882 - call void @llvm.lifetime.end(i64 32, i8* nonnull %226) #7 - %_M_p.i.i888 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_6_b_path, i64 0, i32 0, i32 0 - %241 = load i8*, i8** %_M_p.i.i888, align 8, !tbaa !107 - %call51 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %241, i32 0, i32 1, i32 256, i32 1, i32 1) + store %union.anon* %233, %union.anon** %234, align 8, !tbaa !109, !alias.scope !170 + %_M_p.i.i23.i.i879 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i878, i64 0, i32 0, i32 0 + %235 = load i8*, i8** %_M_p.i.i23.i.i879, align 8, !tbaa !113 + %236 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i878, i64 0, i32 2 + %arraydecay.i.i.i.i880 = bitcast %union.anon* %236 to i8* + %cmp.i.i.i881 = icmp eq i8* %235, %arraydecay.i.i.i.i880 + br i1 %cmp.i.i.i881, label %if.then.i.i883, label %if.else.i.i887 + +if.then.i.i883: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit848 + %arraydecay.i.i.i882 = bitcast %union.anon* %233 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i882, i8* %235, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit890 + +if.else.i.i887: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit848 + %_M_p.i21.i.i884 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_6_b_path, i64 0, i32 0, i32 0 + store i8* %235, i8** %_M_p.i21.i.i884, align 8, !tbaa !113, !alias.scope !170 + %_M_allocated_capacity.i.i885 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i878, i64 0, i32 2, i32 0 + %237 = load i64, i64* %_M_allocated_capacity.i.i885, align 8, !tbaa !66 + %_M_allocated_capacity.i.i.i886 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_6_b_path, i64 0, i32 2, i32 0 + store i64 %237, i64* %_M_allocated_capacity.i.i.i886, align 8, !tbaa !66, !alias.scope !170 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit890 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit890: ; preds = %if.else.i.i887, %if.then.i.i883 + %_M_string_length.i20.i.i888 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i878, i64 0, i32 1 + %238 = load i64, i64* %_M_string_length.i20.i.i888, align 8, !tbaa !110 + %_M_string_length.i.i2.i889 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_6_b_path, i64 0, i32 1 + store i64 %238, i64* %_M_string_length.i.i2.i889, align 8, !tbaa !110, !alias.scope !170 + %239 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i878 to %union.anon** + store %union.anon* %236, %union.anon** %239, align 8, !tbaa !113 + store i64 0, i64* %_M_string_length.i20.i.i888, align 8, !tbaa !110 + store i8 0, i8* %arraydecay.i.i.i.i880, align 1, !tbaa !93 + %_M_p.i.i.i.i891 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp48, i64 0, i32 0, i32 0 + %240 = load i8*, i8** %_M_p.i.i.i.i891, align 8, !tbaa !113 + %cmp.i.i.i893 = icmp eq i8* %240, %229 + br i1 %cmp.i.i.i893, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit895, label %if.then.i.i894 + +if.then.i.i894: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit890 + call void @_ZdlPv(i8* %240) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit895 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit895: ; preds = %if.then.i.i894, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit890 + call void @llvm.lifetime.end(i64 32, i8* nonnull %226) #2 + %_M_p.i.i896 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_6_b_path, i64 0, i32 0, i32 0 + %241 = load i8*, i8** %_M_p.i.i896, align 8, !tbaa !113 + %call51 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %241, i32 0, i64 1, i64 256, i64 1, i64 1) %242 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_7_w_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %242) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %242) #2 %243 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp52 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %243) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %243) #2 %244 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp52, i64 0, i32 2 %245 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp52 to %union.anon** - store %union.anon* %244, %union.anon** %245, align 8, !tbaa !103 + store %union.anon* %244, %union.anon** %245, align 8, !tbaa !109 %246 = bitcast %union.anon* %244 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %246, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.38, i64 0, i64 0), i64 14, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i912 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp52, i64 0, i32 1 - store i64 14, i64* %_M_string_length.i.i.i.i.i.i912, align 8, !tbaa !104 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %246, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.56, i64 0, i64 0), i64 14, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i920 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp52, i64 0, i32 1 + store i64 14, i64* %_M_string_length.i.i.i.i.i.i920, align 8, !tbaa !110 %247 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp52, i64 0, i32 2, i32 1, i64 6 - store i8 0, i8* %247, align 2, !tbaa !87 - %248 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !104, !noalias !155 - %249 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !107, !noalias !155 - %call3.i.i.i917 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp52, i64 0, i64 0, i8* %249, i64 %248) #7, !noalias !155 + store i8 0, i8* %247, align 2, !tbaa !93 + %248 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !110, !noalias !173 + %249 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !113, !noalias !173 + %call3.i.i.i925 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp52, i64 0, i64 0, i8* %249, i64 %248) #2, !noalias !173 %250 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_7_w_path, i64 0, i32 2 %251 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_7_w_path to %union.anon** - store %union.anon* %250, %union.anon** %251, align 8, !tbaa !103, !alias.scope !155 - %_M_p.i.i23.i.i918 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i917, i64 0, i32 0, i32 0 - %252 = load i8*, i8** %_M_p.i.i23.i.i918, align 8, !tbaa !107 - %253 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i917, i64 0, i32 2 - %arraydecay.i.i.i.i919 = bitcast %union.anon* %253 to i8* - %cmp.i.i.i920 = icmp eq i8* %252, %arraydecay.i.i.i.i919 - br i1 %cmp.i.i.i920, label %if.then.i.i922, label %if.else.i.i926 - -if.then.i.i922: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit887 - %arraydecay.i.i.i921 = bitcast %union.anon* %250 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i921, i8* %252, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit929 - -if.else.i.i926: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit887 - %_M_p.i21.i.i923 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_7_w_path, i64 0, i32 0, i32 0 - store i8* %252, i8** %_M_p.i21.i.i923, align 8, !tbaa !107, !alias.scope !155 - %_M_allocated_capacity.i.i924 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i917, i64 0, i32 2, i32 0 - %254 = load i64, i64* %_M_allocated_capacity.i.i924, align 8, !tbaa !63 - %_M_allocated_capacity.i.i.i925 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_7_w_path, i64 0, i32 2, i32 0 - store i64 %254, i64* %_M_allocated_capacity.i.i.i925, align 8, !tbaa !63, !alias.scope !155 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit929 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit929: ; preds = %if.else.i.i926, %if.then.i.i922 - %_M_string_length.i20.i.i927 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i917, i64 0, i32 1 - %255 = load i64, i64* %_M_string_length.i20.i.i927, align 8, !tbaa !104 - %_M_string_length.i.i2.i928 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_7_w_path, i64 0, i32 1 - store i64 %255, i64* %_M_string_length.i.i2.i928, align 8, !tbaa !104, !alias.scope !155 - %256 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i917 to %union.anon** - store %union.anon* %253, %union.anon** %256, align 8, !tbaa !107 - store i64 0, i64* %_M_string_length.i20.i.i927, align 8, !tbaa !104 - store i8 0, i8* %arraydecay.i.i.i.i919, align 1, !tbaa !87 - %_M_p.i.i.i.i930 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp52, i64 0, i32 0, i32 0 - %257 = load i8*, i8** %_M_p.i.i.i.i930, align 8, !tbaa !107 - %cmp.i.i.i932 = icmp eq i8* %257, %246 - br i1 %cmp.i.i.i932, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit934, label %if.then.i.i933 - -if.then.i.i933: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit929 - call void @_ZdlPv(i8* %257) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit934 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit934: ; preds = %if.then.i.i933, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit929 - call void @llvm.lifetime.end(i64 32, i8* nonnull %243) #7 - %_M_p.i.i935 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_7_w_path, i64 0, i32 0, i32 0 - %258 = load i8*, i8** %_M_p.i.i935, align 8, !tbaa !107 - %call55 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %258, i32 0, i32 256, i32 256, i32 3, i32 3) + store %union.anon* %250, %union.anon** %251, align 8, !tbaa !109, !alias.scope !173 + %_M_p.i.i23.i.i926 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i925, i64 0, i32 0, i32 0 + %252 = load i8*, i8** %_M_p.i.i23.i.i926, align 8, !tbaa !113 + %253 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i925, i64 0, i32 2 + %arraydecay.i.i.i.i927 = bitcast %union.anon* %253 to i8* + %cmp.i.i.i928 = icmp eq i8* %252, %arraydecay.i.i.i.i927 + br i1 %cmp.i.i.i928, label %if.then.i.i930, label %if.else.i.i934 + +if.then.i.i930: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit895 + %arraydecay.i.i.i929 = bitcast %union.anon* %250 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i929, i8* %252, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit937 + +if.else.i.i934: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit895 + %_M_p.i21.i.i931 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_7_w_path, i64 0, i32 0, i32 0 + store i8* %252, i8** %_M_p.i21.i.i931, align 8, !tbaa !113, !alias.scope !173 + %_M_allocated_capacity.i.i932 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i925, i64 0, i32 2, i32 0 + %254 = load i64, i64* %_M_allocated_capacity.i.i932, align 8, !tbaa !66 + %_M_allocated_capacity.i.i.i933 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_7_w_path, i64 0, i32 2, i32 0 + store i64 %254, i64* %_M_allocated_capacity.i.i.i933, align 8, !tbaa !66, !alias.scope !173 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit937 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit937: ; preds = %if.else.i.i934, %if.then.i.i930 + %_M_string_length.i20.i.i935 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i925, i64 0, i32 1 + %255 = load i64, i64* %_M_string_length.i20.i.i935, align 8, !tbaa !110 + %_M_string_length.i.i2.i936 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_7_w_path, i64 0, i32 1 + store i64 %255, i64* %_M_string_length.i.i2.i936, align 8, !tbaa !110, !alias.scope !173 + %256 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i925 to %union.anon** + store %union.anon* %253, %union.anon** %256, align 8, !tbaa !113 + store i64 0, i64* %_M_string_length.i20.i.i935, align 8, !tbaa !110 + store i8 0, i8* %arraydecay.i.i.i.i927, align 1, !tbaa !93 + %_M_p.i.i.i.i938 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp52, i64 0, i32 0, i32 0 + %257 = load i8*, i8** %_M_p.i.i.i.i938, align 8, !tbaa !113 + %cmp.i.i.i940 = icmp eq i8* %257, %246 + br i1 %cmp.i.i.i940, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit942, label %if.then.i.i941 + +if.then.i.i941: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit937 + call void @_ZdlPv(i8* %257) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit942 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit942: ; preds = %if.then.i.i941, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit937 + call void @llvm.lifetime.end(i64 32, i8* nonnull %243) #2 + %_M_p.i.i943 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_7_w_path, i64 0, i32 0, i32 0 + %258 = load i8*, i8** %_M_p.i.i943, align 8, !tbaa !113 + %call55 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %258, i32 0, i64 256, i64 256, i64 3, i64 3) %259 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_7_b_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %259) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %259) #2 %260 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp56 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %260) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %260) #2 %261 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp56, i64 0, i32 2 %262 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp56 to %union.anon** - store %union.anon* %261, %union.anon** %262, align 8, !tbaa !103 + store %union.anon* %261, %union.anon** %262, align 8, !tbaa !109 %263 = bitcast %union.anon* %261 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %263, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.39, i64 0, i64 0), i64 14, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i959 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp56, i64 0, i32 1 - store i64 14, i64* %_M_string_length.i.i.i.i.i.i959, align 8, !tbaa !104 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %263, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.57, i64 0, i64 0), i64 14, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i967 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp56, i64 0, i32 1 + store i64 14, i64* %_M_string_length.i.i.i.i.i.i967, align 8, !tbaa !110 %264 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp56, i64 0, i32 2, i32 1, i64 6 - store i8 0, i8* %264, align 2, !tbaa !87 - %265 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !104, !noalias !158 - %266 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !107, !noalias !158 - %call3.i.i.i964 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp56, i64 0, i64 0, i8* %266, i64 %265) #7, !noalias !158 + store i8 0, i8* %264, align 2, !tbaa !93 + %265 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !110, !noalias !176 + %266 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !113, !noalias !176 + %call3.i.i.i972 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp56, i64 0, i64 0, i8* %266, i64 %265) #2, !noalias !176 %267 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_7_b_path, i64 0, i32 2 %268 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_7_b_path to %union.anon** - store %union.anon* %267, %union.anon** %268, align 8, !tbaa !103, !alias.scope !158 - %_M_p.i.i23.i.i965 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i964, i64 0, i32 0, i32 0 - %269 = load i8*, i8** %_M_p.i.i23.i.i965, align 8, !tbaa !107 - %270 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i964, i64 0, i32 2 - %arraydecay.i.i.i.i966 = bitcast %union.anon* %270 to i8* - %cmp.i.i.i967 = icmp eq i8* %269, %arraydecay.i.i.i.i966 - br i1 %cmp.i.i.i967, label %if.then.i.i969, label %if.else.i.i973 - -if.then.i.i969: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit934 - %arraydecay.i.i.i968 = bitcast %union.anon* %267 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i968, i8* %269, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit976 - -if.else.i.i973: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit934 - %_M_p.i21.i.i970 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_7_b_path, i64 0, i32 0, i32 0 - store i8* %269, i8** %_M_p.i21.i.i970, align 8, !tbaa !107, !alias.scope !158 - %_M_allocated_capacity.i.i971 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i964, i64 0, i32 2, i32 0 - %271 = load i64, i64* %_M_allocated_capacity.i.i971, align 8, !tbaa !63 - %_M_allocated_capacity.i.i.i972 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_7_b_path, i64 0, i32 2, i32 0 - store i64 %271, i64* %_M_allocated_capacity.i.i.i972, align 8, !tbaa !63, !alias.scope !158 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit976 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit976: ; preds = %if.else.i.i973, %if.then.i.i969 - %_M_string_length.i20.i.i974 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i964, i64 0, i32 1 - %272 = load i64, i64* %_M_string_length.i20.i.i974, align 8, !tbaa !104 - %_M_string_length.i.i2.i975 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_7_b_path, i64 0, i32 1 - store i64 %272, i64* %_M_string_length.i.i2.i975, align 8, !tbaa !104, !alias.scope !158 - %273 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i964 to %union.anon** - store %union.anon* %270, %union.anon** %273, align 8, !tbaa !107 - store i64 0, i64* %_M_string_length.i20.i.i974, align 8, !tbaa !104 - store i8 0, i8* %arraydecay.i.i.i.i966, align 1, !tbaa !87 - %_M_p.i.i.i.i977 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp56, i64 0, i32 0, i32 0 - %274 = load i8*, i8** %_M_p.i.i.i.i977, align 8, !tbaa !107 - %cmp.i.i.i979 = icmp eq i8* %274, %263 - br i1 %cmp.i.i.i979, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit981, label %if.then.i.i980 - -if.then.i.i980: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit976 - call void @_ZdlPv(i8* %274) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit981 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit981: ; preds = %if.then.i.i980, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit976 - call void @llvm.lifetime.end(i64 32, i8* nonnull %260) #7 - %_M_p.i.i982 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_7_b_path, i64 0, i32 0, i32 0 - %275 = load i8*, i8** %_M_p.i.i982, align 8, !tbaa !107 - %call59 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %275, i32 0, i32 1, i32 256, i32 1, i32 1) + store %union.anon* %267, %union.anon** %268, align 8, !tbaa !109, !alias.scope !176 + %_M_p.i.i23.i.i973 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i972, i64 0, i32 0, i32 0 + %269 = load i8*, i8** %_M_p.i.i23.i.i973, align 8, !tbaa !113 + %270 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i972, i64 0, i32 2 + %arraydecay.i.i.i.i974 = bitcast %union.anon* %270 to i8* + %cmp.i.i.i975 = icmp eq i8* %269, %arraydecay.i.i.i.i974 + br i1 %cmp.i.i.i975, label %if.then.i.i977, label %if.else.i.i981 + +if.then.i.i977: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit942 + %arraydecay.i.i.i976 = bitcast %union.anon* %267 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i976, i8* %269, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit984 + +if.else.i.i981: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit942 + %_M_p.i21.i.i978 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_7_b_path, i64 0, i32 0, i32 0 + store i8* %269, i8** %_M_p.i21.i.i978, align 8, !tbaa !113, !alias.scope !176 + %_M_allocated_capacity.i.i979 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i972, i64 0, i32 2, i32 0 + %271 = load i64, i64* %_M_allocated_capacity.i.i979, align 8, !tbaa !66 + %_M_allocated_capacity.i.i.i980 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_7_b_path, i64 0, i32 2, i32 0 + store i64 %271, i64* %_M_allocated_capacity.i.i.i980, align 8, !tbaa !66, !alias.scope !176 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit984 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit984: ; preds = %if.else.i.i981, %if.then.i.i977 + %_M_string_length.i20.i.i982 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i972, i64 0, i32 1 + %272 = load i64, i64* %_M_string_length.i20.i.i982, align 8, !tbaa !110 + %_M_string_length.i.i2.i983 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_7_b_path, i64 0, i32 1 + store i64 %272, i64* %_M_string_length.i.i2.i983, align 8, !tbaa !110, !alias.scope !176 + %273 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i972 to %union.anon** + store %union.anon* %270, %union.anon** %273, align 8, !tbaa !113 + store i64 0, i64* %_M_string_length.i20.i.i982, align 8, !tbaa !110 + store i8 0, i8* %arraydecay.i.i.i.i974, align 1, !tbaa !93 + %_M_p.i.i.i.i985 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp56, i64 0, i32 0, i32 0 + %274 = load i8*, i8** %_M_p.i.i.i.i985, align 8, !tbaa !113 + %cmp.i.i.i987 = icmp eq i8* %274, %263 + br i1 %cmp.i.i.i987, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit989, label %if.then.i.i988 + +if.then.i.i988: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit984 + call void @_ZdlPv(i8* %274) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit989 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit989: ; preds = %if.then.i.i988, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit984 + call void @llvm.lifetime.end(i64 32, i8* nonnull %260) #2 + %_M_p.i.i990 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_7_b_path, i64 0, i32 0, i32 0 + %275 = load i8*, i8** %_M_p.i.i990, align 8, !tbaa !113 + %call59 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %275, i32 0, i64 1, i64 256, i64 1, i64 1) %276 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_8_w_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %276) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %276) #2 %277 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp60 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %277) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %277) #2 %278 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp60, i64 0, i32 2 %279 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp60 to %union.anon** - store %union.anon* %278, %union.anon** %279, align 8, !tbaa !103 + store %union.anon* %278, %union.anon** %279, align 8, !tbaa !109 %280 = bitcast %union.anon* %278 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %280, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.40, i64 0, i64 0), i64 14, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i1006 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp60, i64 0, i32 1 - store i64 14, i64* %_M_string_length.i.i.i.i.i.i1006, align 8, !tbaa !104 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %280, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.58, i64 0, i64 0), i64 14, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i1014 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp60, i64 0, i32 1 + store i64 14, i64* %_M_string_length.i.i.i.i.i.i1014, align 8, !tbaa !110 %281 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp60, i64 0, i32 2, i32 1, i64 6 - store i8 0, i8* %281, align 2, !tbaa !87 - %282 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !104, !noalias !161 - %283 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !107, !noalias !161 - %call3.i.i.i1011 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp60, i64 0, i64 0, i8* %283, i64 %282) #7, !noalias !161 + store i8 0, i8* %281, align 2, !tbaa !93 + %282 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !110, !noalias !179 + %283 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !113, !noalias !179 + %call3.i.i.i1019 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp60, i64 0, i64 0, i8* %283, i64 %282) #2, !noalias !179 %284 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_8_w_path, i64 0, i32 2 %285 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_8_w_path to %union.anon** - store %union.anon* %284, %union.anon** %285, align 8, !tbaa !103, !alias.scope !161 - %_M_p.i.i23.i.i1012 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1011, i64 0, i32 0, i32 0 - %286 = load i8*, i8** %_M_p.i.i23.i.i1012, align 8, !tbaa !107 - %287 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1011, i64 0, i32 2 - %arraydecay.i.i.i.i1013 = bitcast %union.anon* %287 to i8* - %cmp.i.i.i1014 = icmp eq i8* %286, %arraydecay.i.i.i.i1013 - br i1 %cmp.i.i.i1014, label %if.then.i.i1016, label %if.else.i.i1020 - -if.then.i.i1016: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit981 - %arraydecay.i.i.i1015 = bitcast %union.anon* %284 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1015, i8* %286, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1023 - -if.else.i.i1020: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit981 - %_M_p.i21.i.i1017 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_8_w_path, i64 0, i32 0, i32 0 - store i8* %286, i8** %_M_p.i21.i.i1017, align 8, !tbaa !107, !alias.scope !161 - %_M_allocated_capacity.i.i1018 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1011, i64 0, i32 2, i32 0 - %288 = load i64, i64* %_M_allocated_capacity.i.i1018, align 8, !tbaa !63 - %_M_allocated_capacity.i.i.i1019 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_8_w_path, i64 0, i32 2, i32 0 - store i64 %288, i64* %_M_allocated_capacity.i.i.i1019, align 8, !tbaa !63, !alias.scope !161 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1023 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1023: ; preds = %if.else.i.i1020, %if.then.i.i1016 - %_M_string_length.i20.i.i1021 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1011, i64 0, i32 1 - %289 = load i64, i64* %_M_string_length.i20.i.i1021, align 8, !tbaa !104 - %_M_string_length.i.i2.i1022 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_8_w_path, i64 0, i32 1 - store i64 %289, i64* %_M_string_length.i.i2.i1022, align 8, !tbaa !104, !alias.scope !161 - %290 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1011 to %union.anon** - store %union.anon* %287, %union.anon** %290, align 8, !tbaa !107 - store i64 0, i64* %_M_string_length.i20.i.i1021, align 8, !tbaa !104 - store i8 0, i8* %arraydecay.i.i.i.i1013, align 1, !tbaa !87 - %_M_p.i.i.i.i1024 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp60, i64 0, i32 0, i32 0 - %291 = load i8*, i8** %_M_p.i.i.i.i1024, align 8, !tbaa !107 - %cmp.i.i.i1026 = icmp eq i8* %291, %280 - br i1 %cmp.i.i.i1026, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1028, label %if.then.i.i1027 - -if.then.i.i1027: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1023 - call void @_ZdlPv(i8* %291) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1028 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1028: ; preds = %if.then.i.i1027, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1023 - call void @llvm.lifetime.end(i64 32, i8* nonnull %277) #7 - %_M_p.i.i1029 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_8_w_path, i64 0, i32 0, i32 0 - %292 = load i8*, i8** %_M_p.i.i1029, align 8, !tbaa !107 - %call63 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %292, i32 0, i32 512, i32 256, i32 3, i32 3) + store %union.anon* %284, %union.anon** %285, align 8, !tbaa !109, !alias.scope !179 + %_M_p.i.i23.i.i1020 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1019, i64 0, i32 0, i32 0 + %286 = load i8*, i8** %_M_p.i.i23.i.i1020, align 8, !tbaa !113 + %287 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1019, i64 0, i32 2 + %arraydecay.i.i.i.i1021 = bitcast %union.anon* %287 to i8* + %cmp.i.i.i1022 = icmp eq i8* %286, %arraydecay.i.i.i.i1021 + br i1 %cmp.i.i.i1022, label %if.then.i.i1024, label %if.else.i.i1028 + +if.then.i.i1024: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit989 + %arraydecay.i.i.i1023 = bitcast %union.anon* %284 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1023, i8* %286, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1031 + +if.else.i.i1028: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit989 + %_M_p.i21.i.i1025 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_8_w_path, i64 0, i32 0, i32 0 + store i8* %286, i8** %_M_p.i21.i.i1025, align 8, !tbaa !113, !alias.scope !179 + %_M_allocated_capacity.i.i1026 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1019, i64 0, i32 2, i32 0 + %288 = load i64, i64* %_M_allocated_capacity.i.i1026, align 8, !tbaa !66 + %_M_allocated_capacity.i.i.i1027 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_8_w_path, i64 0, i32 2, i32 0 + store i64 %288, i64* %_M_allocated_capacity.i.i.i1027, align 8, !tbaa !66, !alias.scope !179 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1031 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1031: ; preds = %if.else.i.i1028, %if.then.i.i1024 + %_M_string_length.i20.i.i1029 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1019, i64 0, i32 1 + %289 = load i64, i64* %_M_string_length.i20.i.i1029, align 8, !tbaa !110 + %_M_string_length.i.i2.i1030 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_8_w_path, i64 0, i32 1 + store i64 %289, i64* %_M_string_length.i.i2.i1030, align 8, !tbaa !110, !alias.scope !179 + %290 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1019 to %union.anon** + store %union.anon* %287, %union.anon** %290, align 8, !tbaa !113 + store i64 0, i64* %_M_string_length.i20.i.i1029, align 8, !tbaa !110 + store i8 0, i8* %arraydecay.i.i.i.i1021, align 1, !tbaa !93 + %_M_p.i.i.i.i1032 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp60, i64 0, i32 0, i32 0 + %291 = load i8*, i8** %_M_p.i.i.i.i1032, align 8, !tbaa !113 + %cmp.i.i.i1034 = icmp eq i8* %291, %280 + br i1 %cmp.i.i.i1034, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1036, label %if.then.i.i1035 + +if.then.i.i1035: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1031 + call void @_ZdlPv(i8* %291) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1036 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1036: ; preds = %if.then.i.i1035, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1031 + call void @llvm.lifetime.end(i64 32, i8* nonnull %277) #2 + %_M_p.i.i1037 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_8_w_path, i64 0, i32 0, i32 0 + %292 = load i8*, i8** %_M_p.i.i1037, align 8, !tbaa !113 + %call63 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %292, i32 0, i64 512, i64 256, i64 3, i64 3) %293 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_8_b_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %293) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %293) #2 %294 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp64 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %294) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %294) #2 %295 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp64, i64 0, i32 2 %296 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp64 to %union.anon** - store %union.anon* %295, %union.anon** %296, align 8, !tbaa !103 + store %union.anon* %295, %union.anon** %296, align 8, !tbaa !109 %297 = bitcast %union.anon* %295 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %297, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.41, i64 0, i64 0), i64 14, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i1048 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp64, i64 0, i32 1 - store i64 14, i64* %_M_string_length.i.i.i.i.i.i1048, align 8, !tbaa !104 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %297, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.59, i64 0, i64 0), i64 14, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i1056 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp64, i64 0, i32 1 + store i64 14, i64* %_M_string_length.i.i.i.i.i.i1056, align 8, !tbaa !110 %298 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp64, i64 0, i32 2, i32 1, i64 6 - store i8 0, i8* %298, align 2, !tbaa !87 - %299 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !104, !noalias !164 - %300 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !107, !noalias !164 - %call3.i.i.i1053 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp64, i64 0, i64 0, i8* %300, i64 %299) #7, !noalias !164 + store i8 0, i8* %298, align 2, !tbaa !93 + %299 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !110, !noalias !182 + %300 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !113, !noalias !182 + %call3.i.i.i1061 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp64, i64 0, i64 0, i8* %300, i64 %299) #2, !noalias !182 %301 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_8_b_path, i64 0, i32 2 %302 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_8_b_path to %union.anon** - store %union.anon* %301, %union.anon** %302, align 8, !tbaa !103, !alias.scope !164 - %_M_p.i.i23.i.i1054 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1053, i64 0, i32 0, i32 0 - %303 = load i8*, i8** %_M_p.i.i23.i.i1054, align 8, !tbaa !107 - %304 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1053, i64 0, i32 2 - %arraydecay.i.i.i.i1055 = bitcast %union.anon* %304 to i8* - %cmp.i.i.i1056 = icmp eq i8* %303, %arraydecay.i.i.i.i1055 - br i1 %cmp.i.i.i1056, label %if.then.i.i1058, label %if.else.i.i1062 - -if.then.i.i1058: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1028 - %arraydecay.i.i.i1057 = bitcast %union.anon* %301 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1057, i8* %303, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1065 - -if.else.i.i1062: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1028 - %_M_p.i21.i.i1059 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_8_b_path, i64 0, i32 0, i32 0 - store i8* %303, i8** %_M_p.i21.i.i1059, align 8, !tbaa !107, !alias.scope !164 - %_M_allocated_capacity.i.i1060 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1053, i64 0, i32 2, i32 0 - %305 = load i64, i64* %_M_allocated_capacity.i.i1060, align 8, !tbaa !63 - %_M_allocated_capacity.i.i.i1061 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_8_b_path, i64 0, i32 2, i32 0 - store i64 %305, i64* %_M_allocated_capacity.i.i.i1061, align 8, !tbaa !63, !alias.scope !164 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1065 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1065: ; preds = %if.else.i.i1062, %if.then.i.i1058 - %_M_string_length.i20.i.i1063 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1053, i64 0, i32 1 - %306 = load i64, i64* %_M_string_length.i20.i.i1063, align 8, !tbaa !104 - %_M_string_length.i.i2.i1064 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_8_b_path, i64 0, i32 1 - store i64 %306, i64* %_M_string_length.i.i2.i1064, align 8, !tbaa !104, !alias.scope !164 - %307 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1053 to %union.anon** - store %union.anon* %304, %union.anon** %307, align 8, !tbaa !107 - store i64 0, i64* %_M_string_length.i20.i.i1063, align 8, !tbaa !104 - store i8 0, i8* %arraydecay.i.i.i.i1055, align 1, !tbaa !87 - %_M_p.i.i.i.i1066 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp64, i64 0, i32 0, i32 0 - %308 = load i8*, i8** %_M_p.i.i.i.i1066, align 8, !tbaa !107 - %cmp.i.i.i1068 = icmp eq i8* %308, %297 - br i1 %cmp.i.i.i1068, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1070, label %if.then.i.i1069 - -if.then.i.i1069: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1065 - call void @_ZdlPv(i8* %308) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1070 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1070: ; preds = %if.then.i.i1069, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1065 - call void @llvm.lifetime.end(i64 32, i8* nonnull %294) #7 - %_M_p.i.i1071 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_8_b_path, i64 0, i32 0, i32 0 - %309 = load i8*, i8** %_M_p.i.i1071, align 8, !tbaa !107 - %call67 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %309, i32 0, i32 1, i32 512, i32 1, i32 1) + store %union.anon* %301, %union.anon** %302, align 8, !tbaa !109, !alias.scope !182 + %_M_p.i.i23.i.i1062 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1061, i64 0, i32 0, i32 0 + %303 = load i8*, i8** %_M_p.i.i23.i.i1062, align 8, !tbaa !113 + %304 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1061, i64 0, i32 2 + %arraydecay.i.i.i.i1063 = bitcast %union.anon* %304 to i8* + %cmp.i.i.i1064 = icmp eq i8* %303, %arraydecay.i.i.i.i1063 + br i1 %cmp.i.i.i1064, label %if.then.i.i1066, label %if.else.i.i1070 + +if.then.i.i1066: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1036 + %arraydecay.i.i.i1065 = bitcast %union.anon* %301 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1065, i8* %303, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1073 + +if.else.i.i1070: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1036 + %_M_p.i21.i.i1067 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_8_b_path, i64 0, i32 0, i32 0 + store i8* %303, i8** %_M_p.i21.i.i1067, align 8, !tbaa !113, !alias.scope !182 + %_M_allocated_capacity.i.i1068 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1061, i64 0, i32 2, i32 0 + %305 = load i64, i64* %_M_allocated_capacity.i.i1068, align 8, !tbaa !66 + %_M_allocated_capacity.i.i.i1069 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_8_b_path, i64 0, i32 2, i32 0 + store i64 %305, i64* %_M_allocated_capacity.i.i.i1069, align 8, !tbaa !66, !alias.scope !182 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1073 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1073: ; preds = %if.else.i.i1070, %if.then.i.i1066 + %_M_string_length.i20.i.i1071 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1061, i64 0, i32 1 + %306 = load i64, i64* %_M_string_length.i20.i.i1071, align 8, !tbaa !110 + %_M_string_length.i.i2.i1072 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_8_b_path, i64 0, i32 1 + store i64 %306, i64* %_M_string_length.i.i2.i1072, align 8, !tbaa !110, !alias.scope !182 + %307 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1061 to %union.anon** + store %union.anon* %304, %union.anon** %307, align 8, !tbaa !113 + store i64 0, i64* %_M_string_length.i20.i.i1071, align 8, !tbaa !110 + store i8 0, i8* %arraydecay.i.i.i.i1063, align 1, !tbaa !93 + %_M_p.i.i.i.i1074 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp64, i64 0, i32 0, i32 0 + %308 = load i8*, i8** %_M_p.i.i.i.i1074, align 8, !tbaa !113 + %cmp.i.i.i1076 = icmp eq i8* %308, %297 + br i1 %cmp.i.i.i1076, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1078, label %if.then.i.i1077 + +if.then.i.i1077: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1073 + call void @_ZdlPv(i8* %308) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1078 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1078: ; preds = %if.then.i.i1077, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1073 + call void @llvm.lifetime.end(i64 32, i8* nonnull %294) #2 + %_M_p.i.i1079 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_8_b_path, i64 0, i32 0, i32 0 + %309 = load i8*, i8** %_M_p.i.i1079, align 8, !tbaa !113 + %call67 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %309, i32 0, i64 1, i64 512, i64 1, i64 1) %310 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_9_w_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %310) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %310) #2 %311 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp68 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %311) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %311) #2 %312 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp68, i64 0, i32 2 %313 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp68 to %union.anon** - store %union.anon* %312, %union.anon** %313, align 8, !tbaa !103 + store %union.anon* %312, %union.anon** %313, align 8, !tbaa !109 %314 = bitcast %union.anon* %312 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %314, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.42, i64 0, i64 0), i64 14, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i1085 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp68, i64 0, i32 1 - store i64 14, i64* %_M_string_length.i.i.i.i.i.i1085, align 8, !tbaa !104 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %314, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.60, i64 0, i64 0), i64 14, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i1094 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp68, i64 0, i32 1 + store i64 14, i64* %_M_string_length.i.i.i.i.i.i1094, align 8, !tbaa !110 %315 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp68, i64 0, i32 2, i32 1, i64 6 - store i8 0, i8* %315, align 2, !tbaa !87 - %316 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !104, !noalias !167 - %317 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !107, !noalias !167 - %call3.i.i.i1090 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp68, i64 0, i64 0, i8* %317, i64 %316) #7, !noalias !167 + store i8 0, i8* %315, align 2, !tbaa !93 + %316 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !110, !noalias !185 + %317 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !113, !noalias !185 + %call3.i.i.i1099 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp68, i64 0, i64 0, i8* %317, i64 %316) #2, !noalias !185 %318 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_9_w_path, i64 0, i32 2 %319 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_9_w_path to %union.anon** - store %union.anon* %318, %union.anon** %319, align 8, !tbaa !103, !alias.scope !167 - %_M_p.i.i23.i.i1091 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1090, i64 0, i32 0, i32 0 - %320 = load i8*, i8** %_M_p.i.i23.i.i1091, align 8, !tbaa !107 - %321 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1090, i64 0, i32 2 - %arraydecay.i.i.i.i1092 = bitcast %union.anon* %321 to i8* - %cmp.i.i.i1093 = icmp eq i8* %320, %arraydecay.i.i.i.i1092 - br i1 %cmp.i.i.i1093, label %if.then.i.i1095, label %if.else.i.i1099 - -if.then.i.i1095: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1070 - %arraydecay.i.i.i1094 = bitcast %union.anon* %318 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1094, i8* %320, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1102 - -if.else.i.i1099: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1070 - %_M_p.i21.i.i1096 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_9_w_path, i64 0, i32 0, i32 0 - store i8* %320, i8** %_M_p.i21.i.i1096, align 8, !tbaa !107, !alias.scope !167 - %_M_allocated_capacity.i.i1097 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1090, i64 0, i32 2, i32 0 - %322 = load i64, i64* %_M_allocated_capacity.i.i1097, align 8, !tbaa !63 - %_M_allocated_capacity.i.i.i1098 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_9_w_path, i64 0, i32 2, i32 0 - store i64 %322, i64* %_M_allocated_capacity.i.i.i1098, align 8, !tbaa !63, !alias.scope !167 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1102 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1102: ; preds = %if.else.i.i1099, %if.then.i.i1095 - %_M_string_length.i20.i.i1100 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1090, i64 0, i32 1 - %323 = load i64, i64* %_M_string_length.i20.i.i1100, align 8, !tbaa !104 - %_M_string_length.i.i2.i1101 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_9_w_path, i64 0, i32 1 - store i64 %323, i64* %_M_string_length.i.i2.i1101, align 8, !tbaa !104, !alias.scope !167 - %324 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1090 to %union.anon** - store %union.anon* %321, %union.anon** %324, align 8, !tbaa !107 - store i64 0, i64* %_M_string_length.i20.i.i1100, align 8, !tbaa !104 - store i8 0, i8* %arraydecay.i.i.i.i1092, align 1, !tbaa !87 - %_M_p.i.i.i.i1103 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp68, i64 0, i32 0, i32 0 - %325 = load i8*, i8** %_M_p.i.i.i.i1103, align 8, !tbaa !107 - %cmp.i.i.i1105 = icmp eq i8* %325, %314 - br i1 %cmp.i.i.i1105, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1107, label %if.then.i.i1106 - -if.then.i.i1106: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1102 - call void @_ZdlPv(i8* %325) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1107 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1107: ; preds = %if.then.i.i1106, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1102 - call void @llvm.lifetime.end(i64 32, i8* nonnull %311) #7 - %_M_p.i.i1108 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_9_w_path, i64 0, i32 0, i32 0 - %326 = load i8*, i8** %_M_p.i.i1108, align 8, !tbaa !107 - %call71 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %326, i32 0, i32 512, i32 512, i32 3, i32 3) + store %union.anon* %318, %union.anon** %319, align 8, !tbaa !109, !alias.scope !185 + %_M_p.i.i23.i.i1100 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1099, i64 0, i32 0, i32 0 + %320 = load i8*, i8** %_M_p.i.i23.i.i1100, align 8, !tbaa !113 + %321 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1099, i64 0, i32 2 + %arraydecay.i.i.i.i1101 = bitcast %union.anon* %321 to i8* + %cmp.i.i.i1102 = icmp eq i8* %320, %arraydecay.i.i.i.i1101 + br i1 %cmp.i.i.i1102, label %if.then.i.i1104, label %if.else.i.i1108 + +if.then.i.i1104: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1078 + %arraydecay.i.i.i1103 = bitcast %union.anon* %318 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1103, i8* %320, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1111 + +if.else.i.i1108: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1078 + %_M_p.i21.i.i1105 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_9_w_path, i64 0, i32 0, i32 0 + store i8* %320, i8** %_M_p.i21.i.i1105, align 8, !tbaa !113, !alias.scope !185 + %_M_allocated_capacity.i.i1106 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1099, i64 0, i32 2, i32 0 + %322 = load i64, i64* %_M_allocated_capacity.i.i1106, align 8, !tbaa !66 + %_M_allocated_capacity.i.i.i1107 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_9_w_path, i64 0, i32 2, i32 0 + store i64 %322, i64* %_M_allocated_capacity.i.i.i1107, align 8, !tbaa !66, !alias.scope !185 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1111 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1111: ; preds = %if.else.i.i1108, %if.then.i.i1104 + %_M_string_length.i20.i.i1109 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1099, i64 0, i32 1 + %323 = load i64, i64* %_M_string_length.i20.i.i1109, align 8, !tbaa !110 + %_M_string_length.i.i2.i1110 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_9_w_path, i64 0, i32 1 + store i64 %323, i64* %_M_string_length.i.i2.i1110, align 8, !tbaa !110, !alias.scope !185 + %324 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1099 to %union.anon** + store %union.anon* %321, %union.anon** %324, align 8, !tbaa !113 + store i64 0, i64* %_M_string_length.i20.i.i1109, align 8, !tbaa !110 + store i8 0, i8* %arraydecay.i.i.i.i1101, align 1, !tbaa !93 + %_M_p.i.i.i.i1112 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp68, i64 0, i32 0, i32 0 + %325 = load i8*, i8** %_M_p.i.i.i.i1112, align 8, !tbaa !113 + %cmp.i.i.i1114 = icmp eq i8* %325, %314 + br i1 %cmp.i.i.i1114, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1116, label %if.then.i.i1115 + +if.then.i.i1115: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1111 + call void @_ZdlPv(i8* %325) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1116 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1116: ; preds = %if.then.i.i1115, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1111 + call void @llvm.lifetime.end(i64 32, i8* nonnull %311) #2 + %_M_p.i.i1117 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_9_w_path, i64 0, i32 0, i32 0 + %326 = load i8*, i8** %_M_p.i.i1117, align 8, !tbaa !113 + %call71 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %326, i32 0, i64 512, i64 512, i64 3, i64 3) %327 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_9_b_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %327) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %327) #2 %328 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp72 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %328) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %328) #2 %329 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp72, i64 0, i32 2 %330 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp72 to %union.anon** - store %union.anon* %329, %union.anon** %330, align 8, !tbaa !103 + store %union.anon* %329, %union.anon** %330, align 8, !tbaa !109 %331 = bitcast %union.anon* %329 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %331, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.43, i64 0, i64 0), i64 14, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i1122 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp72, i64 0, i32 1 - store i64 14, i64* %_M_string_length.i.i.i.i.i.i1122, align 8, !tbaa !104 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %331, i8* nonnull getelementptr inbounds ([15 x i8], [15 x i8]* @.str.61, i64 0, i64 0), i64 14, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i1131 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp72, i64 0, i32 1 + store i64 14, i64* %_M_string_length.i.i.i.i.i.i1131, align 8, !tbaa !110 %332 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp72, i64 0, i32 2, i32 1, i64 6 - store i8 0, i8* %332, align 2, !tbaa !87 - %333 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !104, !noalias !170 - %334 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !107, !noalias !170 - %call3.i.i.i1127 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp72, i64 0, i64 0, i8* %334, i64 %333) #7, !noalias !170 + store i8 0, i8* %332, align 2, !tbaa !93 + %333 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !110, !noalias !188 + %334 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !113, !noalias !188 + %call3.i.i.i1136 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp72, i64 0, i64 0, i8* %334, i64 %333) #2, !noalias !188 %335 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_9_b_path, i64 0, i32 2 %336 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_9_b_path to %union.anon** - store %union.anon* %335, %union.anon** %336, align 8, !tbaa !103, !alias.scope !170 - %_M_p.i.i23.i.i1128 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1127, i64 0, i32 0, i32 0 - %337 = load i8*, i8** %_M_p.i.i23.i.i1128, align 8, !tbaa !107 - %338 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1127, i64 0, i32 2 - %arraydecay.i.i.i.i1129 = bitcast %union.anon* %338 to i8* - %cmp.i.i.i1130 = icmp eq i8* %337, %arraydecay.i.i.i.i1129 - br i1 %cmp.i.i.i1130, label %if.then.i.i1132, label %if.else.i.i1136 - -if.then.i.i1132: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1107 - %arraydecay.i.i.i1131 = bitcast %union.anon* %335 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1131, i8* %337, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1139 - -if.else.i.i1136: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1107 - %_M_p.i21.i.i1133 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_9_b_path, i64 0, i32 0, i32 0 - store i8* %337, i8** %_M_p.i21.i.i1133, align 8, !tbaa !107, !alias.scope !170 - %_M_allocated_capacity.i.i1134 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1127, i64 0, i32 2, i32 0 - %339 = load i64, i64* %_M_allocated_capacity.i.i1134, align 8, !tbaa !63 - %_M_allocated_capacity.i.i.i1135 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_9_b_path, i64 0, i32 2, i32 0 - store i64 %339, i64* %_M_allocated_capacity.i.i.i1135, align 8, !tbaa !63, !alias.scope !170 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1139 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1139: ; preds = %if.else.i.i1136, %if.then.i.i1132 - %_M_string_length.i20.i.i1137 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1127, i64 0, i32 1 - %340 = load i64, i64* %_M_string_length.i20.i.i1137, align 8, !tbaa !104 - %_M_string_length.i.i2.i1138 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_9_b_path, i64 0, i32 1 - store i64 %340, i64* %_M_string_length.i.i2.i1138, align 8, !tbaa !104, !alias.scope !170 - %341 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1127 to %union.anon** - store %union.anon* %338, %union.anon** %341, align 8, !tbaa !107 - store i64 0, i64* %_M_string_length.i20.i.i1137, align 8, !tbaa !104 - store i8 0, i8* %arraydecay.i.i.i.i1129, align 1, !tbaa !87 - %_M_p.i.i.i.i1140 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp72, i64 0, i32 0, i32 0 - %342 = load i8*, i8** %_M_p.i.i.i.i1140, align 8, !tbaa !107 - %cmp.i.i.i1142 = icmp eq i8* %342, %331 - br i1 %cmp.i.i.i1142, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1144, label %if.then.i.i1143 - -if.then.i.i1143: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1139 - call void @_ZdlPv(i8* %342) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1144 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1144: ; preds = %if.then.i.i1143, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1139 - call void @llvm.lifetime.end(i64 32, i8* nonnull %328) #7 - %_M_p.i.i1145 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_9_b_path, i64 0, i32 0, i32 0 - %343 = load i8*, i8** %_M_p.i.i1145, align 8, !tbaa !107 - %call75 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %343, i32 0, i32 1, i32 512, i32 1, i32 1) + store %union.anon* %335, %union.anon** %336, align 8, !tbaa !109, !alias.scope !188 + %_M_p.i.i23.i.i1137 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1136, i64 0, i32 0, i32 0 + %337 = load i8*, i8** %_M_p.i.i23.i.i1137, align 8, !tbaa !113 + %338 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1136, i64 0, i32 2 + %arraydecay.i.i.i.i1138 = bitcast %union.anon* %338 to i8* + %cmp.i.i.i1139 = icmp eq i8* %337, %arraydecay.i.i.i.i1138 + br i1 %cmp.i.i.i1139, label %if.then.i.i1141, label %if.else.i.i1145 + +if.then.i.i1141: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1116 + %arraydecay.i.i.i1140 = bitcast %union.anon* %335 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1140, i8* %337, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1148 + +if.else.i.i1145: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1116 + %_M_p.i21.i.i1142 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_9_b_path, i64 0, i32 0, i32 0 + store i8* %337, i8** %_M_p.i21.i.i1142, align 8, !tbaa !113, !alias.scope !188 + %_M_allocated_capacity.i.i1143 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1136, i64 0, i32 2, i32 0 + %339 = load i64, i64* %_M_allocated_capacity.i.i1143, align 8, !tbaa !66 + %_M_allocated_capacity.i.i.i1144 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_9_b_path, i64 0, i32 2, i32 0 + store i64 %339, i64* %_M_allocated_capacity.i.i.i1144, align 8, !tbaa !66, !alias.scope !188 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1148 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1148: ; preds = %if.else.i.i1145, %if.then.i.i1141 + %_M_string_length.i20.i.i1146 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1136, i64 0, i32 1 + %340 = load i64, i64* %_M_string_length.i20.i.i1146, align 8, !tbaa !110 + %_M_string_length.i.i2.i1147 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_9_b_path, i64 0, i32 1 + store i64 %340, i64* %_M_string_length.i.i2.i1147, align 8, !tbaa !110, !alias.scope !188 + %341 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1136 to %union.anon** + store %union.anon* %338, %union.anon** %341, align 8, !tbaa !113 + store i64 0, i64* %_M_string_length.i20.i.i1146, align 8, !tbaa !110 + store i8 0, i8* %arraydecay.i.i.i.i1138, align 1, !tbaa !93 + %_M_p.i.i.i.i1149 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp72, i64 0, i32 0, i32 0 + %342 = load i8*, i8** %_M_p.i.i.i.i1149, align 8, !tbaa !113 + %cmp.i.i.i1151 = icmp eq i8* %342, %331 + br i1 %cmp.i.i.i1151, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1153, label %if.then.i.i1152 + +if.then.i.i1152: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1148 + call void @_ZdlPv(i8* %342) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1153 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1153: ; preds = %if.then.i.i1152, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1148 + call void @llvm.lifetime.end(i64 32, i8* nonnull %328) #2 + %_M_p.i.i1154 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_9_b_path, i64 0, i32 0, i32 0 + %343 = load i8*, i8** %_M_p.i.i1154, align 8, !tbaa !113 + %call75 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %343, i32 0, i64 1, i64 512, i64 1, i64 1) %344 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_10_w_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %344) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %344) #2 %345 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp76 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %345) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %345) #2 %346 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp76, i64 0, i32 2 %347 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp76 to %union.anon** - store %union.anon* %346, %union.anon** %347, align 8, !tbaa !103 + store %union.anon* %346, %union.anon** %347, align 8, !tbaa !109 %348 = bitcast %union.anon* %346 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %348, i8* nonnull getelementptr inbounds ([16 x i8], [16 x i8]* @.str.44, i64 0, i64 0), i64 15, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i1159 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp76, i64 0, i32 1 - store i64 15, i64* %_M_string_length.i.i.i.i.i.i1159, align 8, !tbaa !104 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %348, i8* nonnull getelementptr inbounds ([16 x i8], [16 x i8]* @.str.62, i64 0, i64 0), i64 15, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i1168 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp76, i64 0, i32 1 + store i64 15, i64* %_M_string_length.i.i.i.i.i.i1168, align 8, !tbaa !110 %349 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp76, i64 0, i32 2, i32 1, i64 7 - store i8 0, i8* %349, align 1, !tbaa !87 - %350 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !104, !noalias !173 - %351 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !107, !noalias !173 - %call3.i.i.i1164 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp76, i64 0, i64 0, i8* %351, i64 %350) #7, !noalias !173 + store i8 0, i8* %349, align 1, !tbaa !93 + %350 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !110, !noalias !191 + %351 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !113, !noalias !191 + %call3.i.i.i1173 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp76, i64 0, i64 0, i8* %351, i64 %350) #2, !noalias !191 %352 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_10_w_path, i64 0, i32 2 %353 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_10_w_path to %union.anon** - store %union.anon* %352, %union.anon** %353, align 8, !tbaa !103, !alias.scope !173 - %_M_p.i.i23.i.i1165 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1164, i64 0, i32 0, i32 0 - %354 = load i8*, i8** %_M_p.i.i23.i.i1165, align 8, !tbaa !107 - %355 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1164, i64 0, i32 2 - %arraydecay.i.i.i.i1166 = bitcast %union.anon* %355 to i8* - %cmp.i.i.i1167 = icmp eq i8* %354, %arraydecay.i.i.i.i1166 - br i1 %cmp.i.i.i1167, label %if.then.i.i1169, label %if.else.i.i1173 - -if.then.i.i1169: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1144 - %arraydecay.i.i.i1168 = bitcast %union.anon* %352 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1168, i8* %354, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1176 - -if.else.i.i1173: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1144 - %_M_p.i21.i.i1170 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_10_w_path, i64 0, i32 0, i32 0 - store i8* %354, i8** %_M_p.i21.i.i1170, align 8, !tbaa !107, !alias.scope !173 - %_M_allocated_capacity.i.i1171 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1164, i64 0, i32 2, i32 0 - %356 = load i64, i64* %_M_allocated_capacity.i.i1171, align 8, !tbaa !63 - %_M_allocated_capacity.i.i.i1172 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_10_w_path, i64 0, i32 2, i32 0 - store i64 %356, i64* %_M_allocated_capacity.i.i.i1172, align 8, !tbaa !63, !alias.scope !173 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1176 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1176: ; preds = %if.else.i.i1173, %if.then.i.i1169 - %_M_string_length.i20.i.i1174 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1164, i64 0, i32 1 - %357 = load i64, i64* %_M_string_length.i20.i.i1174, align 8, !tbaa !104 - %_M_string_length.i.i2.i1175 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_10_w_path, i64 0, i32 1 - store i64 %357, i64* %_M_string_length.i.i2.i1175, align 8, !tbaa !104, !alias.scope !173 - %358 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1164 to %union.anon** - store %union.anon* %355, %union.anon** %358, align 8, !tbaa !107 - store i64 0, i64* %_M_string_length.i20.i.i1174, align 8, !tbaa !104 - store i8 0, i8* %arraydecay.i.i.i.i1166, align 1, !tbaa !87 - %_M_p.i.i.i.i1177 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp76, i64 0, i32 0, i32 0 - %359 = load i8*, i8** %_M_p.i.i.i.i1177, align 8, !tbaa !107 - %cmp.i.i.i1179 = icmp eq i8* %359, %348 - br i1 %cmp.i.i.i1179, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1181, label %if.then.i.i1180 - -if.then.i.i1180: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1176 - call void @_ZdlPv(i8* %359) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1181 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1181: ; preds = %if.then.i.i1180, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1176 - call void @llvm.lifetime.end(i64 32, i8* nonnull %345) #7 - %_M_p.i.i1182 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_10_w_path, i64 0, i32 0, i32 0 - %360 = load i8*, i8** %_M_p.i.i1182, align 8, !tbaa !107 - %call79 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %360, i32 0, i32 512, i32 512, i32 3, i32 3) + store %union.anon* %352, %union.anon** %353, align 8, !tbaa !109, !alias.scope !191 + %_M_p.i.i23.i.i1174 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1173, i64 0, i32 0, i32 0 + %354 = load i8*, i8** %_M_p.i.i23.i.i1174, align 8, !tbaa !113 + %355 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1173, i64 0, i32 2 + %arraydecay.i.i.i.i1175 = bitcast %union.anon* %355 to i8* + %cmp.i.i.i1176 = icmp eq i8* %354, %arraydecay.i.i.i.i1175 + br i1 %cmp.i.i.i1176, label %if.then.i.i1178, label %if.else.i.i1182 + +if.then.i.i1178: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1153 + %arraydecay.i.i.i1177 = bitcast %union.anon* %352 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1177, i8* %354, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1185 + +if.else.i.i1182: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1153 + %_M_p.i21.i.i1179 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_10_w_path, i64 0, i32 0, i32 0 + store i8* %354, i8** %_M_p.i21.i.i1179, align 8, !tbaa !113, !alias.scope !191 + %_M_allocated_capacity.i.i1180 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1173, i64 0, i32 2, i32 0 + %356 = load i64, i64* %_M_allocated_capacity.i.i1180, align 8, !tbaa !66 + %_M_allocated_capacity.i.i.i1181 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_10_w_path, i64 0, i32 2, i32 0 + store i64 %356, i64* %_M_allocated_capacity.i.i.i1181, align 8, !tbaa !66, !alias.scope !191 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1185 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1185: ; preds = %if.else.i.i1182, %if.then.i.i1178 + %_M_string_length.i20.i.i1183 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1173, i64 0, i32 1 + %357 = load i64, i64* %_M_string_length.i20.i.i1183, align 8, !tbaa !110 + %_M_string_length.i.i2.i1184 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_10_w_path, i64 0, i32 1 + store i64 %357, i64* %_M_string_length.i.i2.i1184, align 8, !tbaa !110, !alias.scope !191 + %358 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1173 to %union.anon** + store %union.anon* %355, %union.anon** %358, align 8, !tbaa !113 + store i64 0, i64* %_M_string_length.i20.i.i1183, align 8, !tbaa !110 + store i8 0, i8* %arraydecay.i.i.i.i1175, align 1, !tbaa !93 + %_M_p.i.i.i.i1186 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp76, i64 0, i32 0, i32 0 + %359 = load i8*, i8** %_M_p.i.i.i.i1186, align 8, !tbaa !113 + %cmp.i.i.i1188 = icmp eq i8* %359, %348 + br i1 %cmp.i.i.i1188, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1190, label %if.then.i.i1189 + +if.then.i.i1189: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1185 + call void @_ZdlPv(i8* %359) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1190 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1190: ; preds = %if.then.i.i1189, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1185 + call void @llvm.lifetime.end(i64 32, i8* nonnull %345) #2 + %_M_p.i.i1191 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_10_w_path, i64 0, i32 0, i32 0 + %360 = load i8*, i8** %_M_p.i.i1191, align 8, !tbaa !113 + %call79 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %360, i32 0, i64 512, i64 512, i64 3, i64 3) %361 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_10_b_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %361) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %361) #2 %362 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp80 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %362) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %362) #2 %363 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp80, i64 0, i32 2 %364 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp80 to %union.anon** - store %union.anon* %363, %union.anon** %364, align 8, !tbaa !103 + store %union.anon* %363, %union.anon** %364, align 8, !tbaa !109 %365 = bitcast %union.anon* %363 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %365, i8* nonnull getelementptr inbounds ([16 x i8], [16 x i8]* @.str.45, i64 0, i64 0), i64 15, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i1198 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp80, i64 0, i32 1 - store i64 15, i64* %_M_string_length.i.i.i.i.i.i1198, align 8, !tbaa !104 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %365, i8* nonnull getelementptr inbounds ([16 x i8], [16 x i8]* @.str.63, i64 0, i64 0), i64 15, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i1206 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp80, i64 0, i32 1 + store i64 15, i64* %_M_string_length.i.i.i.i.i.i1206, align 8, !tbaa !110 %366 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp80, i64 0, i32 2, i32 1, i64 7 - store i8 0, i8* %366, align 1, !tbaa !87 - %367 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !104, !noalias !176 - %368 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !107, !noalias !176 - %call3.i.i.i1203 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp80, i64 0, i64 0, i8* %368, i64 %367) #7, !noalias !176 + store i8 0, i8* %366, align 1, !tbaa !93 + %367 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !110, !noalias !194 + %368 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !113, !noalias !194 + %call3.i.i.i1211 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp80, i64 0, i64 0, i8* %368, i64 %367) #2, !noalias !194 %369 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_10_b_path, i64 0, i32 2 %370 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_10_b_path to %union.anon** - store %union.anon* %369, %union.anon** %370, align 8, !tbaa !103, !alias.scope !176 - %_M_p.i.i23.i.i1204 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1203, i64 0, i32 0, i32 0 - %371 = load i8*, i8** %_M_p.i.i23.i.i1204, align 8, !tbaa !107 - %372 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1203, i64 0, i32 2 - %arraydecay.i.i.i.i1205 = bitcast %union.anon* %372 to i8* - %cmp.i.i.i1206 = icmp eq i8* %371, %arraydecay.i.i.i.i1205 - br i1 %cmp.i.i.i1206, label %if.then.i.i1208, label %if.else.i.i1212 - -if.then.i.i1208: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1181 - %arraydecay.i.i.i1207 = bitcast %union.anon* %369 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1207, i8* %371, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1215 - -if.else.i.i1212: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1181 - %_M_p.i21.i.i1209 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_10_b_path, i64 0, i32 0, i32 0 - store i8* %371, i8** %_M_p.i21.i.i1209, align 8, !tbaa !107, !alias.scope !176 - %_M_allocated_capacity.i.i1210 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1203, i64 0, i32 2, i32 0 - %373 = load i64, i64* %_M_allocated_capacity.i.i1210, align 8, !tbaa !63 - %_M_allocated_capacity.i.i.i1211 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_10_b_path, i64 0, i32 2, i32 0 - store i64 %373, i64* %_M_allocated_capacity.i.i.i1211, align 8, !tbaa !63, !alias.scope !176 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1215 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1215: ; preds = %if.else.i.i1212, %if.then.i.i1208 - %_M_string_length.i20.i.i1213 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1203, i64 0, i32 1 - %374 = load i64, i64* %_M_string_length.i20.i.i1213, align 8, !tbaa !104 - %_M_string_length.i.i2.i1214 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_10_b_path, i64 0, i32 1 - store i64 %374, i64* %_M_string_length.i.i2.i1214, align 8, !tbaa !104, !alias.scope !176 - %375 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1203 to %union.anon** - store %union.anon* %372, %union.anon** %375, align 8, !tbaa !107 - store i64 0, i64* %_M_string_length.i20.i.i1213, align 8, !tbaa !104 - store i8 0, i8* %arraydecay.i.i.i.i1205, align 1, !tbaa !87 - %_M_p.i.i.i.i1216 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp80, i64 0, i32 0, i32 0 - %376 = load i8*, i8** %_M_p.i.i.i.i1216, align 8, !tbaa !107 - %cmp.i.i.i1218 = icmp eq i8* %376, %365 - br i1 %cmp.i.i.i1218, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1220, label %if.then.i.i1219 - -if.then.i.i1219: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1215 - call void @_ZdlPv(i8* %376) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1220 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1220: ; preds = %if.then.i.i1219, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1215 - call void @llvm.lifetime.end(i64 32, i8* nonnull %362) #7 - %_M_p.i.i1221 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_10_b_path, i64 0, i32 0, i32 0 - %377 = load i8*, i8** %_M_p.i.i1221, align 8, !tbaa !107 - %call83 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %377, i32 0, i32 1, i32 512, i32 1, i32 1) + store %union.anon* %369, %union.anon** %370, align 8, !tbaa !109, !alias.scope !194 + %_M_p.i.i23.i.i1212 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1211, i64 0, i32 0, i32 0 + %371 = load i8*, i8** %_M_p.i.i23.i.i1212, align 8, !tbaa !113 + %372 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1211, i64 0, i32 2 + %arraydecay.i.i.i.i1213 = bitcast %union.anon* %372 to i8* + %cmp.i.i.i1214 = icmp eq i8* %371, %arraydecay.i.i.i.i1213 + br i1 %cmp.i.i.i1214, label %if.then.i.i1216, label %if.else.i.i1220 + +if.then.i.i1216: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1190 + %arraydecay.i.i.i1215 = bitcast %union.anon* %369 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1215, i8* %371, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1223 + +if.else.i.i1220: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1190 + %_M_p.i21.i.i1217 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_10_b_path, i64 0, i32 0, i32 0 + store i8* %371, i8** %_M_p.i21.i.i1217, align 8, !tbaa !113, !alias.scope !194 + %_M_allocated_capacity.i.i1218 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1211, i64 0, i32 2, i32 0 + %373 = load i64, i64* %_M_allocated_capacity.i.i1218, align 8, !tbaa !66 + %_M_allocated_capacity.i.i.i1219 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_10_b_path, i64 0, i32 2, i32 0 + store i64 %373, i64* %_M_allocated_capacity.i.i.i1219, align 8, !tbaa !66, !alias.scope !194 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1223 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1223: ; preds = %if.else.i.i1220, %if.then.i.i1216 + %_M_string_length.i20.i.i1221 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1211, i64 0, i32 1 + %374 = load i64, i64* %_M_string_length.i20.i.i1221, align 8, !tbaa !110 + %_M_string_length.i.i2.i1222 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_10_b_path, i64 0, i32 1 + store i64 %374, i64* %_M_string_length.i.i2.i1222, align 8, !tbaa !110, !alias.scope !194 + %375 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1211 to %union.anon** + store %union.anon* %372, %union.anon** %375, align 8, !tbaa !113 + store i64 0, i64* %_M_string_length.i20.i.i1221, align 8, !tbaa !110 + store i8 0, i8* %arraydecay.i.i.i.i1213, align 1, !tbaa !93 + %_M_p.i.i.i.i1224 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp80, i64 0, i32 0, i32 0 + %376 = load i8*, i8** %_M_p.i.i.i.i1224, align 8, !tbaa !113 + %cmp.i.i.i1226 = icmp eq i8* %376, %365 + br i1 %cmp.i.i.i1226, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1228, label %if.then.i.i1227 + +if.then.i.i1227: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1223 + call void @_ZdlPv(i8* %376) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1228 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1228: ; preds = %if.then.i.i1227, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1223 + call void @llvm.lifetime.end(i64 32, i8* nonnull %362) #2 + %_M_p.i.i1229 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_10_b_path, i64 0, i32 0, i32 0 + %377 = load i8*, i8** %_M_p.i.i1229, align 8, !tbaa !113 + %call83 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %377, i32 0, i64 1, i64 512, i64 1, i64 1) %378 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_11_w_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %378) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %378) #2 %379 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp84 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %379) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %379) #2 %380 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp84, i64 0, i32 2 %381 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp84 to %union.anon** - store %union.anon* %380, %union.anon** %381, align 8, !tbaa !103 + store %union.anon* %380, %union.anon** %381, align 8, !tbaa !109 %382 = bitcast %union.anon* %380 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %382, i8* nonnull getelementptr inbounds ([16 x i8], [16 x i8]* @.str.46, i64 0, i64 0), i64 15, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i1256 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp84, i64 0, i32 1 - store i64 15, i64* %_M_string_length.i.i.i.i.i.i1256, align 8, !tbaa !104 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %382, i8* nonnull getelementptr inbounds ([16 x i8], [16 x i8]* @.str.64, i64 0, i64 0), i64 15, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i1250 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp84, i64 0, i32 1 + store i64 15, i64* %_M_string_length.i.i.i.i.i.i1250, align 8, !tbaa !110 %383 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp84, i64 0, i32 2, i32 1, i64 7 - store i8 0, i8* %383, align 1, !tbaa !87 - %384 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !104, !noalias !179 - %385 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !107, !noalias !179 - %call3.i.i.i1261 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp84, i64 0, i64 0, i8* %385, i64 %384) #7, !noalias !179 + store i8 0, i8* %383, align 1, !tbaa !93 + %384 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !110, !noalias !197 + %385 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !113, !noalias !197 + %call3.i.i.i1255 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp84, i64 0, i64 0, i8* %385, i64 %384) #2, !noalias !197 %386 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_11_w_path, i64 0, i32 2 %387 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_11_w_path to %union.anon** - store %union.anon* %386, %union.anon** %387, align 8, !tbaa !103, !alias.scope !179 - %_M_p.i.i23.i.i1262 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1261, i64 0, i32 0, i32 0 - %388 = load i8*, i8** %_M_p.i.i23.i.i1262, align 8, !tbaa !107 - %389 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1261, i64 0, i32 2 - %arraydecay.i.i.i.i1263 = bitcast %union.anon* %389 to i8* - %cmp.i.i.i1264 = icmp eq i8* %388, %arraydecay.i.i.i.i1263 - br i1 %cmp.i.i.i1264, label %if.then.i.i1266, label %if.else.i.i1270 - -if.then.i.i1266: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1220 - %arraydecay.i.i.i1265 = bitcast %union.anon* %386 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1265, i8* %388, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1273 - -if.else.i.i1270: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1220 - %_M_p.i21.i.i1267 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_11_w_path, i64 0, i32 0, i32 0 - store i8* %388, i8** %_M_p.i21.i.i1267, align 8, !tbaa !107, !alias.scope !179 - %_M_allocated_capacity.i.i1268 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1261, i64 0, i32 2, i32 0 - %390 = load i64, i64* %_M_allocated_capacity.i.i1268, align 8, !tbaa !63 - %_M_allocated_capacity.i.i.i1269 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_11_w_path, i64 0, i32 2, i32 0 - store i64 %390, i64* %_M_allocated_capacity.i.i.i1269, align 8, !tbaa !63, !alias.scope !179 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1273 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1273: ; preds = %if.else.i.i1270, %if.then.i.i1266 - %_M_string_length.i20.i.i1271 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1261, i64 0, i32 1 - %391 = load i64, i64* %_M_string_length.i20.i.i1271, align 8, !tbaa !104 - %_M_string_length.i.i2.i1272 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_11_w_path, i64 0, i32 1 - store i64 %391, i64* %_M_string_length.i.i2.i1272, align 8, !tbaa !104, !alias.scope !179 - %392 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1261 to %union.anon** - store %union.anon* %389, %union.anon** %392, align 8, !tbaa !107 - store i64 0, i64* %_M_string_length.i20.i.i1271, align 8, !tbaa !104 - store i8 0, i8* %arraydecay.i.i.i.i1263, align 1, !tbaa !87 - %_M_p.i.i.i.i1274 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp84, i64 0, i32 0, i32 0 - %393 = load i8*, i8** %_M_p.i.i.i.i1274, align 8, !tbaa !107 - %cmp.i.i.i1276 = icmp eq i8* %393, %382 - br i1 %cmp.i.i.i1276, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1278, label %if.then.i.i1277 - -if.then.i.i1277: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1273 - call void @_ZdlPv(i8* %393) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1278 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1278: ; preds = %if.then.i.i1277, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1273 - call void @llvm.lifetime.end(i64 32, i8* nonnull %379) #7 - %_M_p.i.i1279 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_11_w_path, i64 0, i32 0, i32 0 - %394 = load i8*, i8** %_M_p.i.i1279, align 8, !tbaa !107 - %call87 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %394, i32 0, i32 512, i32 512, i32 3, i32 3) + store %union.anon* %386, %union.anon** %387, align 8, !tbaa !109, !alias.scope !197 + %_M_p.i.i23.i.i1256 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1255, i64 0, i32 0, i32 0 + %388 = load i8*, i8** %_M_p.i.i23.i.i1256, align 8, !tbaa !113 + %389 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1255, i64 0, i32 2 + %arraydecay.i.i.i.i1257 = bitcast %union.anon* %389 to i8* + %cmp.i.i.i1258 = icmp eq i8* %388, %arraydecay.i.i.i.i1257 + br i1 %cmp.i.i.i1258, label %if.then.i.i1260, label %if.else.i.i1264 + +if.then.i.i1260: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1228 + %arraydecay.i.i.i1259 = bitcast %union.anon* %386 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1259, i8* %388, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1267 + +if.else.i.i1264: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1228 + %_M_p.i21.i.i1261 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_11_w_path, i64 0, i32 0, i32 0 + store i8* %388, i8** %_M_p.i21.i.i1261, align 8, !tbaa !113, !alias.scope !197 + %_M_allocated_capacity.i.i1262 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1255, i64 0, i32 2, i32 0 + %390 = load i64, i64* %_M_allocated_capacity.i.i1262, align 8, !tbaa !66 + %_M_allocated_capacity.i.i.i1263 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_11_w_path, i64 0, i32 2, i32 0 + store i64 %390, i64* %_M_allocated_capacity.i.i.i1263, align 8, !tbaa !66, !alias.scope !197 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1267 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1267: ; preds = %if.else.i.i1264, %if.then.i.i1260 + %_M_string_length.i20.i.i1265 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1255, i64 0, i32 1 + %391 = load i64, i64* %_M_string_length.i20.i.i1265, align 8, !tbaa !110 + %_M_string_length.i.i2.i1266 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_11_w_path, i64 0, i32 1 + store i64 %391, i64* %_M_string_length.i.i2.i1266, align 8, !tbaa !110, !alias.scope !197 + %392 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1255 to %union.anon** + store %union.anon* %389, %union.anon** %392, align 8, !tbaa !113 + store i64 0, i64* %_M_string_length.i20.i.i1265, align 8, !tbaa !110 + store i8 0, i8* %arraydecay.i.i.i.i1257, align 1, !tbaa !93 + %_M_p.i.i.i.i1268 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp84, i64 0, i32 0, i32 0 + %393 = load i8*, i8** %_M_p.i.i.i.i1268, align 8, !tbaa !113 + %cmp.i.i.i1270 = icmp eq i8* %393, %382 + br i1 %cmp.i.i.i1270, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1272, label %if.then.i.i1271 + +if.then.i.i1271: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1267 + call void @_ZdlPv(i8* %393) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1272 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1272: ; preds = %if.then.i.i1271, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1267 + call void @llvm.lifetime.end(i64 32, i8* nonnull %379) #2 + %_M_p.i.i1273 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_11_w_path, i64 0, i32 0, i32 0 + %394 = load i8*, i8** %_M_p.i.i1273, align 8, !tbaa !113 + %call87 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %394, i32 0, i64 512, i64 512, i64 3, i64 3) %395 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_11_b_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %395) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %395) #2 %396 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp88 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %396) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %396) #2 %397 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp88, i64 0, i32 2 %398 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp88 to %union.anon** - store %union.anon* %397, %union.anon** %398, align 8, !tbaa !103 + store %union.anon* %397, %union.anon** %398, align 8, !tbaa !109 %399 = bitcast %union.anon* %397 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %399, i8* nonnull getelementptr inbounds ([16 x i8], [16 x i8]* @.str.47, i64 0, i64 0), i64 15, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i1309 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp88, i64 0, i32 1 - store i64 15, i64* %_M_string_length.i.i.i.i.i.i1309, align 8, !tbaa !104 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %399, i8* nonnull getelementptr inbounds ([16 x i8], [16 x i8]* @.str.65, i64 0, i64 0), i64 15, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i1318 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp88, i64 0, i32 1 + store i64 15, i64* %_M_string_length.i.i.i.i.i.i1318, align 8, !tbaa !110 %400 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp88, i64 0, i32 2, i32 1, i64 7 - store i8 0, i8* %400, align 1, !tbaa !87 - %401 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !104, !noalias !182 - %402 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !107, !noalias !182 - %call3.i.i.i1314 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp88, i64 0, i64 0, i8* %402, i64 %401) #7, !noalias !182 + store i8 0, i8* %400, align 1, !tbaa !93 + %401 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !110, !noalias !200 + %402 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !113, !noalias !200 + %call3.i.i.i1323 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp88, i64 0, i64 0, i8* %402, i64 %401) #2, !noalias !200 %403 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_11_b_path, i64 0, i32 2 %404 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_11_b_path to %union.anon** - store %union.anon* %403, %union.anon** %404, align 8, !tbaa !103, !alias.scope !182 - %_M_p.i.i23.i.i1315 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1314, i64 0, i32 0, i32 0 - %405 = load i8*, i8** %_M_p.i.i23.i.i1315, align 8, !tbaa !107 - %406 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1314, i64 0, i32 2 - %arraydecay.i.i.i.i1316 = bitcast %union.anon* %406 to i8* - %cmp.i.i.i1317 = icmp eq i8* %405, %arraydecay.i.i.i.i1316 - br i1 %cmp.i.i.i1317, label %if.then.i.i1319, label %if.else.i.i1323 - -if.then.i.i1319: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1278 - %arraydecay.i.i.i1318 = bitcast %union.anon* %403 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1318, i8* %405, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1326 - -if.else.i.i1323: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1278 - %_M_p.i21.i.i1320 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_11_b_path, i64 0, i32 0, i32 0 - store i8* %405, i8** %_M_p.i21.i.i1320, align 8, !tbaa !107, !alias.scope !182 - %_M_allocated_capacity.i.i1321 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1314, i64 0, i32 2, i32 0 - %407 = load i64, i64* %_M_allocated_capacity.i.i1321, align 8, !tbaa !63 - %_M_allocated_capacity.i.i.i1322 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_11_b_path, i64 0, i32 2, i32 0 - store i64 %407, i64* %_M_allocated_capacity.i.i.i1322, align 8, !tbaa !63, !alias.scope !182 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1326 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1326: ; preds = %if.else.i.i1323, %if.then.i.i1319 - %_M_string_length.i20.i.i1324 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1314, i64 0, i32 1 - %408 = load i64, i64* %_M_string_length.i20.i.i1324, align 8, !tbaa !104 - %_M_string_length.i.i2.i1325 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_11_b_path, i64 0, i32 1 - store i64 %408, i64* %_M_string_length.i.i2.i1325, align 8, !tbaa !104, !alias.scope !182 - %409 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1314 to %union.anon** - store %union.anon* %406, %union.anon** %409, align 8, !tbaa !107 - store i64 0, i64* %_M_string_length.i20.i.i1324, align 8, !tbaa !104 - store i8 0, i8* %arraydecay.i.i.i.i1316, align 1, !tbaa !87 - %_M_p.i.i.i.i1327 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp88, i64 0, i32 0, i32 0 - %410 = load i8*, i8** %_M_p.i.i.i.i1327, align 8, !tbaa !107 - %cmp.i.i.i1329 = icmp eq i8* %410, %399 - br i1 %cmp.i.i.i1329, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1331, label %if.then.i.i1330 - -if.then.i.i1330: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1326 - call void @_ZdlPv(i8* %410) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1331 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1331: ; preds = %if.then.i.i1330, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1326 - call void @llvm.lifetime.end(i64 32, i8* nonnull %396) #7 - %_M_p.i.i1332 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_11_b_path, i64 0, i32 0, i32 0 - %411 = load i8*, i8** %_M_p.i.i1332, align 8, !tbaa !107 - %call91 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %411, i32 0, i32 1, i32 512, i32 1, i32 1) + store %union.anon* %403, %union.anon** %404, align 8, !tbaa !109, !alias.scope !200 + %_M_p.i.i23.i.i1324 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1323, i64 0, i32 0, i32 0 + %405 = load i8*, i8** %_M_p.i.i23.i.i1324, align 8, !tbaa !113 + %406 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1323, i64 0, i32 2 + %arraydecay.i.i.i.i1325 = bitcast %union.anon* %406 to i8* + %cmp.i.i.i1326 = icmp eq i8* %405, %arraydecay.i.i.i.i1325 + br i1 %cmp.i.i.i1326, label %if.then.i.i1328, label %if.else.i.i1332 + +if.then.i.i1328: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1272 + %arraydecay.i.i.i1327 = bitcast %union.anon* %403 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1327, i8* %405, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1335 + +if.else.i.i1332: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1272 + %_M_p.i21.i.i1329 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_11_b_path, i64 0, i32 0, i32 0 + store i8* %405, i8** %_M_p.i21.i.i1329, align 8, !tbaa !113, !alias.scope !200 + %_M_allocated_capacity.i.i1330 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1323, i64 0, i32 2, i32 0 + %407 = load i64, i64* %_M_allocated_capacity.i.i1330, align 8, !tbaa !66 + %_M_allocated_capacity.i.i.i1331 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_11_b_path, i64 0, i32 2, i32 0 + store i64 %407, i64* %_M_allocated_capacity.i.i.i1331, align 8, !tbaa !66, !alias.scope !200 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1335 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1335: ; preds = %if.else.i.i1332, %if.then.i.i1328 + %_M_string_length.i20.i.i1333 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1323, i64 0, i32 1 + %408 = load i64, i64* %_M_string_length.i20.i.i1333, align 8, !tbaa !110 + %_M_string_length.i.i2.i1334 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_11_b_path, i64 0, i32 1 + store i64 %408, i64* %_M_string_length.i.i2.i1334, align 8, !tbaa !110, !alias.scope !200 + %409 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1323 to %union.anon** + store %union.anon* %406, %union.anon** %409, align 8, !tbaa !113 + store i64 0, i64* %_M_string_length.i20.i.i1333, align 8, !tbaa !110 + store i8 0, i8* %arraydecay.i.i.i.i1325, align 1, !tbaa !93 + %_M_p.i.i.i.i1336 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp88, i64 0, i32 0, i32 0 + %410 = load i8*, i8** %_M_p.i.i.i.i1336, align 8, !tbaa !113 + %cmp.i.i.i1338 = icmp eq i8* %410, %399 + br i1 %cmp.i.i.i1338, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1340, label %if.then.i.i1339 + +if.then.i.i1339: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1335 + call void @_ZdlPv(i8* %410) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1340 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1340: ; preds = %if.then.i.i1339, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1335 + call void @llvm.lifetime.end(i64 32, i8* nonnull %396) #2 + %_M_p.i.i1341 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_11_b_path, i64 0, i32 0, i32 0 + %411 = load i8*, i8** %_M_p.i.i1341, align 8, !tbaa !113 + %call91 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %411, i32 0, i64 1, i64 512, i64 1, i64 1) %412 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_12_w_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %412) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %412) #2 %413 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp92 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %413) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %413) #2 %414 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp92, i64 0, i32 2 %415 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp92 to %union.anon** - store %union.anon* %414, %union.anon** %415, align 8, !tbaa !103 + store %union.anon* %414, %union.anon** %415, align 8, !tbaa !109 %416 = bitcast %union.anon* %414 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %416, i8* nonnull getelementptr inbounds ([16 x i8], [16 x i8]* @.str.48, i64 0, i64 0), i64 15, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i1352 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp92, i64 0, i32 1 - store i64 15, i64* %_M_string_length.i.i.i.i.i.i1352, align 8, !tbaa !104 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %416, i8* nonnull getelementptr inbounds ([16 x i8], [16 x i8]* @.str.66, i64 0, i64 0), i64 15, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i1355 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp92, i64 0, i32 1 + store i64 15, i64* %_M_string_length.i.i.i.i.i.i1355, align 8, !tbaa !110 %417 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp92, i64 0, i32 2, i32 1, i64 7 - store i8 0, i8* %417, align 1, !tbaa !87 - %418 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !104, !noalias !185 - %419 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !107, !noalias !185 - %call3.i.i.i1357 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp92, i64 0, i64 0, i8* %419, i64 %418) #7, !noalias !185 + store i8 0, i8* %417, align 1, !tbaa !93 + %418 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !110, !noalias !203 + %419 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !113, !noalias !203 + %call3.i.i.i1360 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp92, i64 0, i64 0, i8* %419, i64 %418) #2, !noalias !203 %420 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_12_w_path, i64 0, i32 2 %421 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_12_w_path to %union.anon** - store %union.anon* %420, %union.anon** %421, align 8, !tbaa !103, !alias.scope !185 - %_M_p.i.i23.i.i1358 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1357, i64 0, i32 0, i32 0 - %422 = load i8*, i8** %_M_p.i.i23.i.i1358, align 8, !tbaa !107 - %423 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1357, i64 0, i32 2 - %arraydecay.i.i.i.i1359 = bitcast %union.anon* %423 to i8* - %cmp.i.i.i1360 = icmp eq i8* %422, %arraydecay.i.i.i.i1359 - br i1 %cmp.i.i.i1360, label %if.then.i.i1362, label %if.else.i.i1366 - -if.then.i.i1362: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1331 - %arraydecay.i.i.i1361 = bitcast %union.anon* %420 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1361, i8* %422, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1369 - -if.else.i.i1366: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1331 - %_M_p.i21.i.i1363 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_12_w_path, i64 0, i32 0, i32 0 - store i8* %422, i8** %_M_p.i21.i.i1363, align 8, !tbaa !107, !alias.scope !185 - %_M_allocated_capacity.i.i1364 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1357, i64 0, i32 2, i32 0 - %424 = load i64, i64* %_M_allocated_capacity.i.i1364, align 8, !tbaa !63 - %_M_allocated_capacity.i.i.i1365 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_12_w_path, i64 0, i32 2, i32 0 - store i64 %424, i64* %_M_allocated_capacity.i.i.i1365, align 8, !tbaa !63, !alias.scope !185 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1369 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1369: ; preds = %if.else.i.i1366, %if.then.i.i1362 - %_M_string_length.i20.i.i1367 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1357, i64 0, i32 1 - %425 = load i64, i64* %_M_string_length.i20.i.i1367, align 8, !tbaa !104 - %_M_string_length.i.i2.i1368 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_12_w_path, i64 0, i32 1 - store i64 %425, i64* %_M_string_length.i.i2.i1368, align 8, !tbaa !104, !alias.scope !185 - %426 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1357 to %union.anon** - store %union.anon* %423, %union.anon** %426, align 8, !tbaa !107 - store i64 0, i64* %_M_string_length.i20.i.i1367, align 8, !tbaa !104 - store i8 0, i8* %arraydecay.i.i.i.i1359, align 1, !tbaa !87 - %_M_p.i.i.i.i1370 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp92, i64 0, i32 0, i32 0 - %427 = load i8*, i8** %_M_p.i.i.i.i1370, align 8, !tbaa !107 - %cmp.i.i.i1372 = icmp eq i8* %427, %416 - br i1 %cmp.i.i.i1372, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1374, label %if.then.i.i1373 - -if.then.i.i1373: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1369 - call void @_ZdlPv(i8* %427) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1374 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1374: ; preds = %if.then.i.i1373, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1369 - call void @llvm.lifetime.end(i64 32, i8* nonnull %413) #7 - %_M_p.i.i1375 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_12_w_path, i64 0, i32 0, i32 0 - %428 = load i8*, i8** %_M_p.i.i1375, align 8, !tbaa !107 - %call95 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %428, i32 0, i32 512, i32 512, i32 3, i32 3) + store %union.anon* %420, %union.anon** %421, align 8, !tbaa !109, !alias.scope !203 + %_M_p.i.i23.i.i1361 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1360, i64 0, i32 0, i32 0 + %422 = load i8*, i8** %_M_p.i.i23.i.i1361, align 8, !tbaa !113 + %423 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1360, i64 0, i32 2 + %arraydecay.i.i.i.i1362 = bitcast %union.anon* %423 to i8* + %cmp.i.i.i1363 = icmp eq i8* %422, %arraydecay.i.i.i.i1362 + br i1 %cmp.i.i.i1363, label %if.then.i.i1365, label %if.else.i.i1369 + +if.then.i.i1365: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1340 + %arraydecay.i.i.i1364 = bitcast %union.anon* %420 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1364, i8* %422, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1372 + +if.else.i.i1369: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1340 + %_M_p.i21.i.i1366 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_12_w_path, i64 0, i32 0, i32 0 + store i8* %422, i8** %_M_p.i21.i.i1366, align 8, !tbaa !113, !alias.scope !203 + %_M_allocated_capacity.i.i1367 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1360, i64 0, i32 2, i32 0 + %424 = load i64, i64* %_M_allocated_capacity.i.i1367, align 8, !tbaa !66 + %_M_allocated_capacity.i.i.i1368 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_12_w_path, i64 0, i32 2, i32 0 + store i64 %424, i64* %_M_allocated_capacity.i.i.i1368, align 8, !tbaa !66, !alias.scope !203 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1372 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1372: ; preds = %if.else.i.i1369, %if.then.i.i1365 + %_M_string_length.i20.i.i1370 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1360, i64 0, i32 1 + %425 = load i64, i64* %_M_string_length.i20.i.i1370, align 8, !tbaa !110 + %_M_string_length.i.i2.i1371 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_12_w_path, i64 0, i32 1 + store i64 %425, i64* %_M_string_length.i.i2.i1371, align 8, !tbaa !110, !alias.scope !203 + %426 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1360 to %union.anon** + store %union.anon* %423, %union.anon** %426, align 8, !tbaa !113 + store i64 0, i64* %_M_string_length.i20.i.i1370, align 8, !tbaa !110 + store i8 0, i8* %arraydecay.i.i.i.i1362, align 1, !tbaa !93 + %_M_p.i.i.i.i1373 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp92, i64 0, i32 0, i32 0 + %427 = load i8*, i8** %_M_p.i.i.i.i1373, align 8, !tbaa !113 + %cmp.i.i.i1375 = icmp eq i8* %427, %416 + br i1 %cmp.i.i.i1375, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1377, label %if.then.i.i1376 + +if.then.i.i1376: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1372 + call void @_ZdlPv(i8* %427) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1377 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1377: ; preds = %if.then.i.i1376, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1372 + call void @llvm.lifetime.end(i64 32, i8* nonnull %413) #2 + %_M_p.i.i1378 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_12_w_path, i64 0, i32 0, i32 0 + %428 = load i8*, i8** %_M_p.i.i1378, align 8, !tbaa !113 + %call95 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %428, i32 0, i64 512, i64 512, i64 3, i64 3) %429 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_12_b_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %429) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %429) #2 %430 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp96 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %430) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %430) #2 %431 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp96, i64 0, i32 2 %432 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp96 to %union.anon** - store %union.anon* %431, %union.anon** %432, align 8, !tbaa !103 + store %union.anon* %431, %union.anon** %432, align 8, !tbaa !109 %433 = bitcast %union.anon* %431 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %433, i8* nonnull getelementptr inbounds ([16 x i8], [16 x i8]* @.str.49, i64 0, i64 0), i64 15, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i1420 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp96, i64 0, i32 1 - store i64 15, i64* %_M_string_length.i.i.i.i.i.i1420, align 8, !tbaa !104 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %433, i8* nonnull getelementptr inbounds ([16 x i8], [16 x i8]* @.str.67, i64 0, i64 0), i64 15, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i1413 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp96, i64 0, i32 1 + store i64 15, i64* %_M_string_length.i.i.i.i.i.i1413, align 8, !tbaa !110 %434 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp96, i64 0, i32 2, i32 1, i64 7 - store i8 0, i8* %434, align 1, !tbaa !87 - %435 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !104, !noalias !188 - %436 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !107, !noalias !188 - %call3.i.i.i1425 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp96, i64 0, i64 0, i8* %436, i64 %435) #7, !noalias !188 + store i8 0, i8* %434, align 1, !tbaa !93 + %435 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !110, !noalias !206 + %436 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !113, !noalias !206 + %call3.i.i.i1418 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp96, i64 0, i64 0, i8* %436, i64 %435) #2, !noalias !206 %437 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_12_b_path, i64 0, i32 2 %438 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_12_b_path to %union.anon** - store %union.anon* %437, %union.anon** %438, align 8, !tbaa !103, !alias.scope !188 - %_M_p.i.i23.i.i1426 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1425, i64 0, i32 0, i32 0 - %439 = load i8*, i8** %_M_p.i.i23.i.i1426, align 8, !tbaa !107 - %440 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1425, i64 0, i32 2 - %arraydecay.i.i.i.i1427 = bitcast %union.anon* %440 to i8* - %cmp.i.i.i1428 = icmp eq i8* %439, %arraydecay.i.i.i.i1427 - br i1 %cmp.i.i.i1428, label %if.then.i.i1430, label %if.else.i.i1434 - -if.then.i.i1430: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1374 - %arraydecay.i.i.i1429 = bitcast %union.anon* %437 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1429, i8* %439, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1437 - -if.else.i.i1434: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1374 - %_M_p.i21.i.i1431 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_12_b_path, i64 0, i32 0, i32 0 - store i8* %439, i8** %_M_p.i21.i.i1431, align 8, !tbaa !107, !alias.scope !188 - %_M_allocated_capacity.i.i1432 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1425, i64 0, i32 2, i32 0 - %441 = load i64, i64* %_M_allocated_capacity.i.i1432, align 8, !tbaa !63 - %_M_allocated_capacity.i.i.i1433 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_12_b_path, i64 0, i32 2, i32 0 - store i64 %441, i64* %_M_allocated_capacity.i.i.i1433, align 8, !tbaa !63, !alias.scope !188 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1437 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1437: ; preds = %if.else.i.i1434, %if.then.i.i1430 - %_M_string_length.i20.i.i1435 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1425, i64 0, i32 1 - %442 = load i64, i64* %_M_string_length.i20.i.i1435, align 8, !tbaa !104 - %_M_string_length.i.i2.i1436 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_12_b_path, i64 0, i32 1 - store i64 %442, i64* %_M_string_length.i.i2.i1436, align 8, !tbaa !104, !alias.scope !188 - %443 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1425 to %union.anon** - store %union.anon* %440, %union.anon** %443, align 8, !tbaa !107 - store i64 0, i64* %_M_string_length.i20.i.i1435, align 8, !tbaa !104 - store i8 0, i8* %arraydecay.i.i.i.i1427, align 1, !tbaa !87 - %_M_p.i.i.i.i1438 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp96, i64 0, i32 0, i32 0 - %444 = load i8*, i8** %_M_p.i.i.i.i1438, align 8, !tbaa !107 - %cmp.i.i.i1440 = icmp eq i8* %444, %433 - br i1 %cmp.i.i.i1440, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1442, label %if.then.i.i1441 - -if.then.i.i1441: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1437 - call void @_ZdlPv(i8* %444) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1442 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1442: ; preds = %if.then.i.i1441, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1437 - call void @llvm.lifetime.end(i64 32, i8* nonnull %430) #7 - %_M_p.i.i1443 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_12_b_path, i64 0, i32 0, i32 0 - %445 = load i8*, i8** %_M_p.i.i1443, align 8, !tbaa !107 - %call99 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %445, i32 0, i32 1, i32 512, i32 1, i32 1) + store %union.anon* %437, %union.anon** %438, align 8, !tbaa !109, !alias.scope !206 + %_M_p.i.i23.i.i1419 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1418, i64 0, i32 0, i32 0 + %439 = load i8*, i8** %_M_p.i.i23.i.i1419, align 8, !tbaa !113 + %440 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1418, i64 0, i32 2 + %arraydecay.i.i.i.i1420 = bitcast %union.anon* %440 to i8* + %cmp.i.i.i1421 = icmp eq i8* %439, %arraydecay.i.i.i.i1420 + br i1 %cmp.i.i.i1421, label %if.then.i.i1423, label %if.else.i.i1427 + +if.then.i.i1423: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1377 + %arraydecay.i.i.i1422 = bitcast %union.anon* %437 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1422, i8* %439, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1430 + +if.else.i.i1427: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1377 + %_M_p.i21.i.i1424 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_12_b_path, i64 0, i32 0, i32 0 + store i8* %439, i8** %_M_p.i21.i.i1424, align 8, !tbaa !113, !alias.scope !206 + %_M_allocated_capacity.i.i1425 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1418, i64 0, i32 2, i32 0 + %441 = load i64, i64* %_M_allocated_capacity.i.i1425, align 8, !tbaa !66 + %_M_allocated_capacity.i.i.i1426 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_12_b_path, i64 0, i32 2, i32 0 + store i64 %441, i64* %_M_allocated_capacity.i.i.i1426, align 8, !tbaa !66, !alias.scope !206 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1430 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1430: ; preds = %if.else.i.i1427, %if.then.i.i1423 + %_M_string_length.i20.i.i1428 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1418, i64 0, i32 1 + %442 = load i64, i64* %_M_string_length.i20.i.i1428, align 8, !tbaa !110 + %_M_string_length.i.i2.i1429 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_12_b_path, i64 0, i32 1 + store i64 %442, i64* %_M_string_length.i.i2.i1429, align 8, !tbaa !110, !alias.scope !206 + %443 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1418 to %union.anon** + store %union.anon* %440, %union.anon** %443, align 8, !tbaa !113 + store i64 0, i64* %_M_string_length.i20.i.i1428, align 8, !tbaa !110 + store i8 0, i8* %arraydecay.i.i.i.i1420, align 1, !tbaa !93 + %_M_p.i.i.i.i1431 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp96, i64 0, i32 0, i32 0 + %444 = load i8*, i8** %_M_p.i.i.i.i1431, align 8, !tbaa !113 + %cmp.i.i.i1433 = icmp eq i8* %444, %433 + br i1 %cmp.i.i.i1433, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1435, label %if.then.i.i1434 + +if.then.i.i1434: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1430 + call void @_ZdlPv(i8* %444) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1435 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1435: ; preds = %if.then.i.i1434, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1430 + call void @llvm.lifetime.end(i64 32, i8* nonnull %430) #2 + %_M_p.i.i1436 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_12_b_path, i64 0, i32 0, i32 0 + %445 = load i8*, i8** %_M_p.i.i1436, align 8, !tbaa !113 + %call99 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %445, i32 0, i64 1, i64 512, i64 1, i64 1) %446 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_13_w_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %446) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %446) #2 %447 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp100 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %447) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %447) #2 %448 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp100, i64 0, i32 2 %449 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp100 to %union.anon** - store %union.anon* %448, %union.anon** %449, align 8, !tbaa !103 + store %union.anon* %448, %union.anon** %449, align 8, !tbaa !109 %450 = bitcast %union.anon* %448 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %450, i8* nonnull getelementptr inbounds ([16 x i8], [16 x i8]* @.str.50, i64 0, i64 0), i64 15, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i1457 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp100, i64 0, i32 1 - store i64 15, i64* %_M_string_length.i.i.i.i.i.i1457, align 8, !tbaa !104 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %450, i8* nonnull getelementptr inbounds ([16 x i8], [16 x i8]* @.str.68, i64 0, i64 0), i64 15, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i1466 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp100, i64 0, i32 1 + store i64 15, i64* %_M_string_length.i.i.i.i.i.i1466, align 8, !tbaa !110 %451 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp100, i64 0, i32 2, i32 1, i64 7 - store i8 0, i8* %451, align 1, !tbaa !87 - %452 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !104, !noalias !191 - %453 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !107, !noalias !191 - %call3.i.i.i1462 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp100, i64 0, i64 0, i8* %453, i64 %452) #7, !noalias !191 + store i8 0, i8* %451, align 1, !tbaa !93 + %452 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !110, !noalias !209 + %453 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !113, !noalias !209 + %call3.i.i.i1471 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp100, i64 0, i64 0, i8* %453, i64 %452) #2, !noalias !209 %454 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_13_w_path, i64 0, i32 2 %455 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_13_w_path to %union.anon** - store %union.anon* %454, %union.anon** %455, align 8, !tbaa !103, !alias.scope !191 - %_M_p.i.i23.i.i1463 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1462, i64 0, i32 0, i32 0 - %456 = load i8*, i8** %_M_p.i.i23.i.i1463, align 8, !tbaa !107 - %457 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1462, i64 0, i32 2 - %arraydecay.i.i.i.i1464 = bitcast %union.anon* %457 to i8* - %cmp.i.i.i1465 = icmp eq i8* %456, %arraydecay.i.i.i.i1464 - br i1 %cmp.i.i.i1465, label %if.then.i.i1467, label %if.else.i.i1471 - -if.then.i.i1467: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1442 - %arraydecay.i.i.i1466 = bitcast %union.anon* %454 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1466, i8* %456, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1474 - -if.else.i.i1471: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1442 - %_M_p.i21.i.i1468 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_13_w_path, i64 0, i32 0, i32 0 - store i8* %456, i8** %_M_p.i21.i.i1468, align 8, !tbaa !107, !alias.scope !191 - %_M_allocated_capacity.i.i1469 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1462, i64 0, i32 2, i32 0 - %458 = load i64, i64* %_M_allocated_capacity.i.i1469, align 8, !tbaa !63 - %_M_allocated_capacity.i.i.i1470 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_13_w_path, i64 0, i32 2, i32 0 - store i64 %458, i64* %_M_allocated_capacity.i.i.i1470, align 8, !tbaa !63, !alias.scope !191 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1474 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1474: ; preds = %if.else.i.i1471, %if.then.i.i1467 - %_M_string_length.i20.i.i1472 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1462, i64 0, i32 1 - %459 = load i64, i64* %_M_string_length.i20.i.i1472, align 8, !tbaa !104 - %_M_string_length.i.i2.i1473 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_13_w_path, i64 0, i32 1 - store i64 %459, i64* %_M_string_length.i.i2.i1473, align 8, !tbaa !104, !alias.scope !191 - %460 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1462 to %union.anon** - store %union.anon* %457, %union.anon** %460, align 8, !tbaa !107 - store i64 0, i64* %_M_string_length.i20.i.i1472, align 8, !tbaa !104 - store i8 0, i8* %arraydecay.i.i.i.i1464, align 1, !tbaa !87 - %_M_p.i.i.i.i1475 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp100, i64 0, i32 0, i32 0 - %461 = load i8*, i8** %_M_p.i.i.i.i1475, align 8, !tbaa !107 - %cmp.i.i.i1477 = icmp eq i8* %461, %450 - br i1 %cmp.i.i.i1477, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1479, label %if.then.i.i1478 - -if.then.i.i1478: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1474 - call void @_ZdlPv(i8* %461) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1479 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1479: ; preds = %if.then.i.i1478, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1474 - call void @llvm.lifetime.end(i64 32, i8* nonnull %447) #7 - %_M_p.i.i1480 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_13_w_path, i64 0, i32 0, i32 0 - %462 = load i8*, i8** %_M_p.i.i1480, align 8, !tbaa !107 - %call103 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %462, i32 0, i32 512, i32 512, i32 3, i32 3) + store %union.anon* %454, %union.anon** %455, align 8, !tbaa !109, !alias.scope !209 + %_M_p.i.i23.i.i1472 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1471, i64 0, i32 0, i32 0 + %456 = load i8*, i8** %_M_p.i.i23.i.i1472, align 8, !tbaa !113 + %457 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1471, i64 0, i32 2 + %arraydecay.i.i.i.i1473 = bitcast %union.anon* %457 to i8* + %cmp.i.i.i1474 = icmp eq i8* %456, %arraydecay.i.i.i.i1473 + br i1 %cmp.i.i.i1474, label %if.then.i.i1476, label %if.else.i.i1480 + +if.then.i.i1476: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1435 + %arraydecay.i.i.i1475 = bitcast %union.anon* %454 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1475, i8* %456, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1483 + +if.else.i.i1480: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1435 + %_M_p.i21.i.i1477 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_13_w_path, i64 0, i32 0, i32 0 + store i8* %456, i8** %_M_p.i21.i.i1477, align 8, !tbaa !113, !alias.scope !209 + %_M_allocated_capacity.i.i1478 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1471, i64 0, i32 2, i32 0 + %458 = load i64, i64* %_M_allocated_capacity.i.i1478, align 8, !tbaa !66 + %_M_allocated_capacity.i.i.i1479 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_13_w_path, i64 0, i32 2, i32 0 + store i64 %458, i64* %_M_allocated_capacity.i.i.i1479, align 8, !tbaa !66, !alias.scope !209 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1483 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1483: ; preds = %if.else.i.i1480, %if.then.i.i1476 + %_M_string_length.i20.i.i1481 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1471, i64 0, i32 1 + %459 = load i64, i64* %_M_string_length.i20.i.i1481, align 8, !tbaa !110 + %_M_string_length.i.i2.i1482 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_13_w_path, i64 0, i32 1 + store i64 %459, i64* %_M_string_length.i.i2.i1482, align 8, !tbaa !110, !alias.scope !209 + %460 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1471 to %union.anon** + store %union.anon* %457, %union.anon** %460, align 8, !tbaa !113 + store i64 0, i64* %_M_string_length.i20.i.i1481, align 8, !tbaa !110 + store i8 0, i8* %arraydecay.i.i.i.i1473, align 1, !tbaa !93 + %_M_p.i.i.i.i1484 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp100, i64 0, i32 0, i32 0 + %461 = load i8*, i8** %_M_p.i.i.i.i1484, align 8, !tbaa !113 + %cmp.i.i.i1486 = icmp eq i8* %461, %450 + br i1 %cmp.i.i.i1486, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1488, label %if.then.i.i1487 + +if.then.i.i1487: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1483 + call void @_ZdlPv(i8* %461) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1488 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1488: ; preds = %if.then.i.i1487, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1483 + call void @llvm.lifetime.end(i64 32, i8* nonnull %447) #2 + %_M_p.i.i1489 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_13_w_path, i64 0, i32 0, i32 0 + %462 = load i8*, i8** %_M_p.i.i1489, align 8, !tbaa !113 + %call103 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %462, i32 0, i64 512, i64 512, i64 3, i64 3) %463 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_13_b_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %463) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %463) #2 %464 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp104 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %464) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %464) #2 %465 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp104, i64 0, i32 2 %466 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp104 to %union.anon** - store %union.anon* %465, %union.anon** %466, align 8, !tbaa !103 + store %union.anon* %465, %union.anon** %466, align 8, !tbaa !109 %467 = bitcast %union.anon* %465 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %467, i8* nonnull getelementptr inbounds ([16 x i8], [16 x i8]* @.str.51, i64 0, i64 0), i64 15, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i1515 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp104, i64 0, i32 1 - store i64 15, i64* %_M_string_length.i.i.i.i.i.i1515, align 8, !tbaa !104 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %467, i8* nonnull getelementptr inbounds ([16 x i8], [16 x i8]* @.str.69, i64 0, i64 0), i64 15, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i1509 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp104, i64 0, i32 1 + store i64 15, i64* %_M_string_length.i.i.i.i.i.i1509, align 8, !tbaa !110 %468 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp104, i64 0, i32 2, i32 1, i64 7 - store i8 0, i8* %468, align 1, !tbaa !87 - %469 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !104, !noalias !194 - %470 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !107, !noalias !194 - %call3.i.i.i1520 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp104, i64 0, i64 0, i8* %470, i64 %469) #7, !noalias !194 + store i8 0, i8* %468, align 1, !tbaa !93 + %469 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !110, !noalias !212 + %470 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !113, !noalias !212 + %call3.i.i.i1514 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp104, i64 0, i64 0, i8* %470, i64 %469) #2, !noalias !212 %471 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_13_b_path, i64 0, i32 2 %472 = bitcast %"class.std::__cxx11::basic_string"* %conv2d_13_b_path to %union.anon** - store %union.anon* %471, %union.anon** %472, align 8, !tbaa !103, !alias.scope !194 - %_M_p.i.i23.i.i1521 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1520, i64 0, i32 0, i32 0 - %473 = load i8*, i8** %_M_p.i.i23.i.i1521, align 8, !tbaa !107 - %474 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1520, i64 0, i32 2 - %arraydecay.i.i.i.i1522 = bitcast %union.anon* %474 to i8* - %cmp.i.i.i1523 = icmp eq i8* %473, %arraydecay.i.i.i.i1522 - br i1 %cmp.i.i.i1523, label %if.then.i.i1525, label %if.else.i.i1529 - -if.then.i.i1525: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1479 - %arraydecay.i.i.i1524 = bitcast %union.anon* %471 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1524, i8* %473, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1532 - -if.else.i.i1529: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1479 - %_M_p.i21.i.i1526 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_13_b_path, i64 0, i32 0, i32 0 - store i8* %473, i8** %_M_p.i21.i.i1526, align 8, !tbaa !107, !alias.scope !194 - %_M_allocated_capacity.i.i1527 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1520, i64 0, i32 2, i32 0 - %475 = load i64, i64* %_M_allocated_capacity.i.i1527, align 8, !tbaa !63 - %_M_allocated_capacity.i.i.i1528 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_13_b_path, i64 0, i32 2, i32 0 - store i64 %475, i64* %_M_allocated_capacity.i.i.i1528, align 8, !tbaa !63, !alias.scope !194 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1532 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1532: ; preds = %if.else.i.i1529, %if.then.i.i1525 - %_M_string_length.i20.i.i1530 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1520, i64 0, i32 1 - %476 = load i64, i64* %_M_string_length.i20.i.i1530, align 8, !tbaa !104 - %_M_string_length.i.i2.i1531 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_13_b_path, i64 0, i32 1 - store i64 %476, i64* %_M_string_length.i.i2.i1531, align 8, !tbaa !104, !alias.scope !194 - %477 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1520 to %union.anon** - store %union.anon* %474, %union.anon** %477, align 8, !tbaa !107 - store i64 0, i64* %_M_string_length.i20.i.i1530, align 8, !tbaa !104 - store i8 0, i8* %arraydecay.i.i.i.i1522, align 1, !tbaa !87 - %_M_p.i.i.i.i1533 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp104, i64 0, i32 0, i32 0 - %478 = load i8*, i8** %_M_p.i.i.i.i1533, align 8, !tbaa !107 - %cmp.i.i.i1535 = icmp eq i8* %478, %467 - br i1 %cmp.i.i.i1535, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1537, label %if.then.i.i1536 - -if.then.i.i1536: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1532 - call void @_ZdlPv(i8* %478) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1537 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1537: ; preds = %if.then.i.i1536, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1532 - call void @llvm.lifetime.end(i64 32, i8* nonnull %464) #7 - %_M_p.i.i1538 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_13_b_path, i64 0, i32 0, i32 0 - %479 = load i8*, i8** %_M_p.i.i1538, align 8, !tbaa !107 - %call107 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %479, i32 0, i32 1, i32 512, i32 1, i32 1) + store %union.anon* %471, %union.anon** %472, align 8, !tbaa !109, !alias.scope !212 + %_M_p.i.i23.i.i1515 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1514, i64 0, i32 0, i32 0 + %473 = load i8*, i8** %_M_p.i.i23.i.i1515, align 8, !tbaa !113 + %474 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1514, i64 0, i32 2 + %arraydecay.i.i.i.i1516 = bitcast %union.anon* %474 to i8* + %cmp.i.i.i1517 = icmp eq i8* %473, %arraydecay.i.i.i.i1516 + br i1 %cmp.i.i.i1517, label %if.then.i.i1519, label %if.else.i.i1523 + +if.then.i.i1519: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1488 + %arraydecay.i.i.i1518 = bitcast %union.anon* %471 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1518, i8* %473, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1526 + +if.else.i.i1523: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1488 + %_M_p.i21.i.i1520 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_13_b_path, i64 0, i32 0, i32 0 + store i8* %473, i8** %_M_p.i21.i.i1520, align 8, !tbaa !113, !alias.scope !212 + %_M_allocated_capacity.i.i1521 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1514, i64 0, i32 2, i32 0 + %475 = load i64, i64* %_M_allocated_capacity.i.i1521, align 8, !tbaa !66 + %_M_allocated_capacity.i.i.i1522 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_13_b_path, i64 0, i32 2, i32 0 + store i64 %475, i64* %_M_allocated_capacity.i.i.i1522, align 8, !tbaa !66, !alias.scope !212 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1526 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1526: ; preds = %if.else.i.i1523, %if.then.i.i1519 + %_M_string_length.i20.i.i1524 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1514, i64 0, i32 1 + %476 = load i64, i64* %_M_string_length.i20.i.i1524, align 8, !tbaa !110 + %_M_string_length.i.i2.i1525 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_13_b_path, i64 0, i32 1 + store i64 %476, i64* %_M_string_length.i.i2.i1525, align 8, !tbaa !110, !alias.scope !212 + %477 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1514 to %union.anon** + store %union.anon* %474, %union.anon** %477, align 8, !tbaa !113 + store i64 0, i64* %_M_string_length.i20.i.i1524, align 8, !tbaa !110 + store i8 0, i8* %arraydecay.i.i.i.i1516, align 1, !tbaa !93 + %_M_p.i.i.i.i1527 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp104, i64 0, i32 0, i32 0 + %478 = load i8*, i8** %_M_p.i.i.i.i1527, align 8, !tbaa !113 + %cmp.i.i.i1529 = icmp eq i8* %478, %467 + br i1 %cmp.i.i.i1529, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1531, label %if.then.i.i1530 + +if.then.i.i1530: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1526 + call void @_ZdlPv(i8* %478) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1531 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1531: ; preds = %if.then.i.i1530, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1526 + call void @llvm.lifetime.end(i64 32, i8* nonnull %464) #2 + %_M_p.i.i1532 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %conv2d_13_b_path, i64 0, i32 0, i32 0 + %479 = load i8*, i8** %_M_p.i.i1532, align 8, !tbaa !113 + %call107 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %479, i32 0, i64 1, i64 512, i64 1, i64 1) %480 = bitcast %"class.std::__cxx11::basic_string"* %dense_1_w_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %480) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %480) #2 %481 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp108 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %481) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %481) #2 %482 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp108, i64 0, i32 2 %483 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp108 to %union.anon** - store %union.anon* %482, %union.anon** %483, align 8, !tbaa !103 + store %union.anon* %482, %union.anon** %483, align 8, !tbaa !109 %484 = bitcast %union.anon* %482 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %484, i8* nonnull getelementptr inbounds ([14 x i8], [14 x i8]* @.str.52, i64 0, i64 0), i64 13, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i1568 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp108, i64 0, i32 1 - store i64 13, i64* %_M_string_length.i.i.i.i.i.i1568, align 8, !tbaa !104 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %484, i8* nonnull getelementptr inbounds ([14 x i8], [14 x i8]* @.str.70, i64 0, i64 0), i64 13, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i1577 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp108, i64 0, i32 1 + store i64 13, i64* %_M_string_length.i.i.i.i.i.i1577, align 8, !tbaa !110 %485 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp108, i64 0, i32 2, i32 1, i64 5 - store i8 0, i8* %485, align 1, !tbaa !87 - %486 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !104, !noalias !197 - %487 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !107, !noalias !197 - %call3.i.i.i1573 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp108, i64 0, i64 0, i8* %487, i64 %486) #7, !noalias !197 + store i8 0, i8* %485, align 1, !tbaa !93 + %486 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !110, !noalias !215 + %487 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !113, !noalias !215 + %call3.i.i.i1582 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp108, i64 0, i64 0, i8* %487, i64 %486) #2, !noalias !215 %488 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_1_w_path, i64 0, i32 2 %489 = bitcast %"class.std::__cxx11::basic_string"* %dense_1_w_path to %union.anon** - store %union.anon* %488, %union.anon** %489, align 8, !tbaa !103, !alias.scope !197 - %_M_p.i.i23.i.i1574 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1573, i64 0, i32 0, i32 0 - %490 = load i8*, i8** %_M_p.i.i23.i.i1574, align 8, !tbaa !107 - %491 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1573, i64 0, i32 2 - %arraydecay.i.i.i.i1575 = bitcast %union.anon* %491 to i8* - %cmp.i.i.i1576 = icmp eq i8* %490, %arraydecay.i.i.i.i1575 - br i1 %cmp.i.i.i1576, label %if.then.i.i1578, label %if.else.i.i1582 - -if.then.i.i1578: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1537 - %arraydecay.i.i.i1577 = bitcast %union.anon* %488 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1577, i8* %490, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1585 - -if.else.i.i1582: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1537 - %_M_p.i21.i.i1579 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_1_w_path, i64 0, i32 0, i32 0 - store i8* %490, i8** %_M_p.i21.i.i1579, align 8, !tbaa !107, !alias.scope !197 - %_M_allocated_capacity.i.i1580 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1573, i64 0, i32 2, i32 0 - %492 = load i64, i64* %_M_allocated_capacity.i.i1580, align 8, !tbaa !63 - %_M_allocated_capacity.i.i.i1581 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_1_w_path, i64 0, i32 2, i32 0 - store i64 %492, i64* %_M_allocated_capacity.i.i.i1581, align 8, !tbaa !63, !alias.scope !197 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1585 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1585: ; preds = %if.else.i.i1582, %if.then.i.i1578 - %_M_string_length.i20.i.i1583 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1573, i64 0, i32 1 - %493 = load i64, i64* %_M_string_length.i20.i.i1583, align 8, !tbaa !104 - %_M_string_length.i.i2.i1584 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_1_w_path, i64 0, i32 1 - store i64 %493, i64* %_M_string_length.i.i2.i1584, align 8, !tbaa !104, !alias.scope !197 - %494 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1573 to %union.anon** - store %union.anon* %491, %union.anon** %494, align 8, !tbaa !107 - store i64 0, i64* %_M_string_length.i20.i.i1583, align 8, !tbaa !104 - store i8 0, i8* %arraydecay.i.i.i.i1575, align 1, !tbaa !87 - %_M_p.i.i.i.i1586 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp108, i64 0, i32 0, i32 0 - %495 = load i8*, i8** %_M_p.i.i.i.i1586, align 8, !tbaa !107 - %cmp.i.i.i1588 = icmp eq i8* %495, %484 - br i1 %cmp.i.i.i1588, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1590, label %if.then.i.i1589 - -if.then.i.i1589: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1585 - call void @_ZdlPv(i8* %495) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1590 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1590: ; preds = %if.then.i.i1589, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1585 - call void @llvm.lifetime.end(i64 32, i8* nonnull %481) #7 - %_M_p.i.i1591 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_1_w_path, i64 0, i32 0, i32 0 - %496 = load i8*, i8** %_M_p.i.i1591, align 8, !tbaa !107 - %call111 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %496, i32 0, i32 1, i32 1, i32 512, i32 512) + store %union.anon* %488, %union.anon** %489, align 8, !tbaa !109, !alias.scope !215 + %_M_p.i.i23.i.i1583 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1582, i64 0, i32 0, i32 0 + %490 = load i8*, i8** %_M_p.i.i23.i.i1583, align 8, !tbaa !113 + %491 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1582, i64 0, i32 2 + %arraydecay.i.i.i.i1584 = bitcast %union.anon* %491 to i8* + %cmp.i.i.i1585 = icmp eq i8* %490, %arraydecay.i.i.i.i1584 + br i1 %cmp.i.i.i1585, label %if.then.i.i1587, label %if.else.i.i1591 + +if.then.i.i1587: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1531 + %arraydecay.i.i.i1586 = bitcast %union.anon* %488 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1586, i8* %490, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1594 + +if.else.i.i1591: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1531 + %_M_p.i21.i.i1588 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_1_w_path, i64 0, i32 0, i32 0 + store i8* %490, i8** %_M_p.i21.i.i1588, align 8, !tbaa !113, !alias.scope !215 + %_M_allocated_capacity.i.i1589 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1582, i64 0, i32 2, i32 0 + %492 = load i64, i64* %_M_allocated_capacity.i.i1589, align 8, !tbaa !66 + %_M_allocated_capacity.i.i.i1590 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_1_w_path, i64 0, i32 2, i32 0 + store i64 %492, i64* %_M_allocated_capacity.i.i.i1590, align 8, !tbaa !66, !alias.scope !215 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1594 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1594: ; preds = %if.else.i.i1591, %if.then.i.i1587 + %_M_string_length.i20.i.i1592 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1582, i64 0, i32 1 + %493 = load i64, i64* %_M_string_length.i20.i.i1592, align 8, !tbaa !110 + %_M_string_length.i.i2.i1593 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_1_w_path, i64 0, i32 1 + store i64 %493, i64* %_M_string_length.i.i2.i1593, align 8, !tbaa !110, !alias.scope !215 + %494 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1582 to %union.anon** + store %union.anon* %491, %union.anon** %494, align 8, !tbaa !113 + store i64 0, i64* %_M_string_length.i20.i.i1592, align 8, !tbaa !110 + store i8 0, i8* %arraydecay.i.i.i.i1584, align 1, !tbaa !93 + %_M_p.i.i.i.i1595 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp108, i64 0, i32 0, i32 0 + %495 = load i8*, i8** %_M_p.i.i.i.i1595, align 8, !tbaa !113 + %cmp.i.i.i1597 = icmp eq i8* %495, %484 + br i1 %cmp.i.i.i1597, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1599, label %if.then.i.i1598 + +if.then.i.i1598: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1594 + call void @_ZdlPv(i8* %495) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1599 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1599: ; preds = %if.then.i.i1598, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1594 + call void @llvm.lifetime.end(i64 32, i8* nonnull %481) #2 + %_M_p.i.i1600 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_1_w_path, i64 0, i32 0, i32 0 + %496 = load i8*, i8** %_M_p.i.i1600, align 8, !tbaa !113 + %call111 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %496, i32 0, i64 1, i64 1, i64 512, i64 512) %497 = bitcast %"class.std::__cxx11::basic_string"* %dense_1_b_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %497) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %497) #2 %498 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp112 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %498) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %498) #2 %499 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp112, i64 0, i32 2 %500 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp112 to %union.anon** - store %union.anon* %499, %union.anon** %500, align 8, !tbaa !103 + store %union.anon* %499, %union.anon** %500, align 8, !tbaa !109 %501 = bitcast %union.anon* %499 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %501, i8* nonnull getelementptr inbounds ([14 x i8], [14 x i8]* @.str.53, i64 0, i64 0), i64 13, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i1552 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp112, i64 0, i32 1 - store i64 13, i64* %_M_string_length.i.i.i.i.i.i1552, align 8, !tbaa !104 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %501, i8* nonnull getelementptr inbounds ([14 x i8], [14 x i8]* @.str.71, i64 0, i64 0), i64 13, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i1561 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp112, i64 0, i32 1 + store i64 13, i64* %_M_string_length.i.i.i.i.i.i1561, align 8, !tbaa !110 %502 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp112, i64 0, i32 2, i32 1, i64 5 - store i8 0, i8* %502, align 1, !tbaa !87 - %503 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !104, !noalias !200 - %504 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !107, !noalias !200 - %call3.i.i.i1489 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp112, i64 0, i64 0, i8* %504, i64 %503) #7, !noalias !200 + store i8 0, i8* %502, align 1, !tbaa !93 + %503 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !110, !noalias !218 + %504 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !113, !noalias !218 + %call3.i.i.i1535 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp112, i64 0, i64 0, i8* %504, i64 %503) #2, !noalias !218 %505 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_1_b_path, i64 0, i32 2 %506 = bitcast %"class.std::__cxx11::basic_string"* %dense_1_b_path to %union.anon** - store %union.anon* %505, %union.anon** %506, align 8, !tbaa !103, !alias.scope !200 - %_M_p.i.i23.i.i1490 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1489, i64 0, i32 0, i32 0 - %507 = load i8*, i8** %_M_p.i.i23.i.i1490, align 8, !tbaa !107 - %508 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1489, i64 0, i32 2 - %arraydecay.i.i.i.i1491 = bitcast %union.anon* %508 to i8* - %cmp.i.i.i1492 = icmp eq i8* %507, %arraydecay.i.i.i.i1491 - br i1 %cmp.i.i.i1492, label %if.then.i.i1494, label %if.else.i.i1498 - -if.then.i.i1494: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1590 - %arraydecay.i.i.i1493 = bitcast %union.anon* %505 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1493, i8* %507, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1501 - -if.else.i.i1498: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1590 - %_M_p.i21.i.i1495 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_1_b_path, i64 0, i32 0, i32 0 - store i8* %507, i8** %_M_p.i21.i.i1495, align 8, !tbaa !107, !alias.scope !200 - %_M_allocated_capacity.i.i1496 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1489, i64 0, i32 2, i32 0 - %509 = load i64, i64* %_M_allocated_capacity.i.i1496, align 8, !tbaa !63 - %_M_allocated_capacity.i.i.i1497 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_1_b_path, i64 0, i32 2, i32 0 - store i64 %509, i64* %_M_allocated_capacity.i.i.i1497, align 8, !tbaa !63, !alias.scope !200 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1501 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1501: ; preds = %if.else.i.i1498, %if.then.i.i1494 - %_M_string_length.i20.i.i1499 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1489, i64 0, i32 1 - %510 = load i64, i64* %_M_string_length.i20.i.i1499, align 8, !tbaa !104 - %_M_string_length.i.i2.i1500 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_1_b_path, i64 0, i32 1 - store i64 %510, i64* %_M_string_length.i.i2.i1500, align 8, !tbaa !104, !alias.scope !200 - %511 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1489 to %union.anon** - store %union.anon* %508, %union.anon** %511, align 8, !tbaa !107 - store i64 0, i64* %_M_string_length.i20.i.i1499, align 8, !tbaa !104 - store i8 0, i8* %arraydecay.i.i.i.i1491, align 1, !tbaa !87 - %_M_p.i.i.i.i1482 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp112, i64 0, i32 0, i32 0 - %512 = load i8*, i8** %_M_p.i.i.i.i1482, align 8, !tbaa !107 - %cmp.i.i.i1484 = icmp eq i8* %512, %501 - br i1 %cmp.i.i.i1484, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1486, label %if.then.i.i1485 - -if.then.i.i1485: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1501 - call void @_ZdlPv(i8* %512) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1486 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1486: ; preds = %if.then.i.i1485, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1501 - call void @llvm.lifetime.end(i64 32, i8* nonnull %498) #7 - %_M_p.i.i1481 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_1_b_path, i64 0, i32 0, i32 0 - %513 = load i8*, i8** %_M_p.i.i1481, align 8, !tbaa !107 - %call115 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %513, i32 0, i32 1, i32 512, i32 1, i32 1) + store %union.anon* %505, %union.anon** %506, align 8, !tbaa !109, !alias.scope !218 + %_M_p.i.i23.i.i1536 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1535, i64 0, i32 0, i32 0 + %507 = load i8*, i8** %_M_p.i.i23.i.i1536, align 8, !tbaa !113 + %508 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1535, i64 0, i32 2 + %arraydecay.i.i.i.i1537 = bitcast %union.anon* %508 to i8* + %cmp.i.i.i1538 = icmp eq i8* %507, %arraydecay.i.i.i.i1537 + br i1 %cmp.i.i.i1538, label %if.then.i.i1540, label %if.else.i.i1544 + +if.then.i.i1540: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1599 + %arraydecay.i.i.i1539 = bitcast %union.anon* %505 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1539, i8* %507, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1547 + +if.else.i.i1544: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1599 + %_M_p.i21.i.i1541 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_1_b_path, i64 0, i32 0, i32 0 + store i8* %507, i8** %_M_p.i21.i.i1541, align 8, !tbaa !113, !alias.scope !218 + %_M_allocated_capacity.i.i1542 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1535, i64 0, i32 2, i32 0 + %509 = load i64, i64* %_M_allocated_capacity.i.i1542, align 8, !tbaa !66 + %_M_allocated_capacity.i.i.i1543 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_1_b_path, i64 0, i32 2, i32 0 + store i64 %509, i64* %_M_allocated_capacity.i.i.i1543, align 8, !tbaa !66, !alias.scope !218 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1547 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1547: ; preds = %if.else.i.i1544, %if.then.i.i1540 + %_M_string_length.i20.i.i1545 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1535, i64 0, i32 1 + %510 = load i64, i64* %_M_string_length.i20.i.i1545, align 8, !tbaa !110 + %_M_string_length.i.i2.i1546 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_1_b_path, i64 0, i32 1 + store i64 %510, i64* %_M_string_length.i.i2.i1546, align 8, !tbaa !110, !alias.scope !218 + %511 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1535 to %union.anon** + store %union.anon* %508, %union.anon** %511, align 8, !tbaa !113 + store i64 0, i64* %_M_string_length.i20.i.i1545, align 8, !tbaa !110 + store i8 0, i8* %arraydecay.i.i.i.i1537, align 1, !tbaa !93 + %_M_p.i.i.i.i1491 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp112, i64 0, i32 0, i32 0 + %512 = load i8*, i8** %_M_p.i.i.i.i1491, align 8, !tbaa !113 + %cmp.i.i.i1493 = icmp eq i8* %512, %501 + br i1 %cmp.i.i.i1493, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1495, label %if.then.i.i1494 + +if.then.i.i1494: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1547 + call void @_ZdlPv(i8* %512) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1495 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1495: ; preds = %if.then.i.i1494, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1547 + call void @llvm.lifetime.end(i64 32, i8* nonnull %498) #2 + %_M_p.i.i1490 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_1_b_path, i64 0, i32 0, i32 0 + %513 = load i8*, i8** %_M_p.i.i1490, align 8, !tbaa !113 + %call115 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %513, i32 0, i64 1, i64 512, i64 1, i64 1) %514 = bitcast %"class.std::__cxx11::basic_string"* %dense_2_w_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %514) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %514) #2 %515 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp116 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %515) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %515) #2 %516 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp116, i64 0, i32 2 %517 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp116 to %union.anon** - store %union.anon* %516, %union.anon** %517, align 8, !tbaa !103 + store %union.anon* %516, %union.anon** %517, align 8, !tbaa !109 %518 = bitcast %union.anon* %516 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %518, i8* nonnull getelementptr inbounds ([14 x i8], [14 x i8]* @.str.54, i64 0, i64 0), i64 13, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i1404 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp116, i64 0, i32 1 - store i64 13, i64* %_M_string_length.i.i.i.i.i.i1404, align 8, !tbaa !104 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %518, i8* nonnull getelementptr inbounds ([14 x i8], [14 x i8]* @.str.72, i64 0, i64 0), i64 13, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i1450 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp116, i64 0, i32 1 + store i64 13, i64* %_M_string_length.i.i.i.i.i.i1450, align 8, !tbaa !110 %519 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp116, i64 0, i32 2, i32 1, i64 5 - store i8 0, i8* %519, align 1, !tbaa !87 - %520 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !104, !noalias !203 - %521 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !107, !noalias !203 - %call3.i.i.i1378 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp116, i64 0, i64 0, i8* %521, i64 %520) #7, !noalias !203 + store i8 0, i8* %519, align 1, !tbaa !93 + %520 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !110, !noalias !221 + %521 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !113, !noalias !221 + %call3.i.i.i1387 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp116, i64 0, i64 0, i8* %521, i64 %520) #2, !noalias !221 %522 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_2_w_path, i64 0, i32 2 %523 = bitcast %"class.std::__cxx11::basic_string"* %dense_2_w_path to %union.anon** - store %union.anon* %522, %union.anon** %523, align 8, !tbaa !103, !alias.scope !203 - %_M_p.i.i23.i.i1379 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1378, i64 0, i32 0, i32 0 - %524 = load i8*, i8** %_M_p.i.i23.i.i1379, align 8, !tbaa !107 - %525 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1378, i64 0, i32 2 - %arraydecay.i.i.i.i1380 = bitcast %union.anon* %525 to i8* - %cmp.i.i.i1381 = icmp eq i8* %524, %arraydecay.i.i.i.i1380 - br i1 %cmp.i.i.i1381, label %if.then.i.i1383, label %if.else.i.i1387 - -if.then.i.i1383: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1486 - %arraydecay.i.i.i1382 = bitcast %union.anon* %522 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1382, i8* %524, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1390 - -if.else.i.i1387: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1486 - %_M_p.i21.i.i1384 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_2_w_path, i64 0, i32 0, i32 0 - store i8* %524, i8** %_M_p.i21.i.i1384, align 8, !tbaa !107, !alias.scope !203 - %_M_allocated_capacity.i.i1385 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1378, i64 0, i32 2, i32 0 - %526 = load i64, i64* %_M_allocated_capacity.i.i1385, align 8, !tbaa !63 - %_M_allocated_capacity.i.i.i1386 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_2_w_path, i64 0, i32 2, i32 0 - store i64 %526, i64* %_M_allocated_capacity.i.i.i1386, align 8, !tbaa !63, !alias.scope !203 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1390 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1390: ; preds = %if.else.i.i1387, %if.then.i.i1383 - %_M_string_length.i20.i.i1388 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1378, i64 0, i32 1 - %527 = load i64, i64* %_M_string_length.i20.i.i1388, align 8, !tbaa !104 - %_M_string_length.i.i2.i1389 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_2_w_path, i64 0, i32 1 - store i64 %527, i64* %_M_string_length.i.i2.i1389, align 8, !tbaa !104, !alias.scope !203 - %528 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1378 to %union.anon** - store %union.anon* %525, %union.anon** %528, align 8, !tbaa !107 - store i64 0, i64* %_M_string_length.i20.i.i1388, align 8, !tbaa !104 - store i8 0, i8* %arraydecay.i.i.i.i1380, align 1, !tbaa !87 - %_M_p.i.i.i.i1334 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp116, i64 0, i32 0, i32 0 - %529 = load i8*, i8** %_M_p.i.i.i.i1334, align 8, !tbaa !107 - %cmp.i.i.i1336 = icmp eq i8* %529, %518 - br i1 %cmp.i.i.i1336, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1338, label %if.then.i.i1337 - -if.then.i.i1337: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1390 - call void @_ZdlPv(i8* %529) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1338 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1338: ; preds = %if.then.i.i1337, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1390 - call void @llvm.lifetime.end(i64 32, i8* nonnull %515) #7 - %_M_p.i.i1333 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_2_w_path, i64 0, i32 0, i32 0 - %530 = load i8*, i8** %_M_p.i.i1333, align 8, !tbaa !107 - %call119 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %530, i32 0, i32 1, i32 1, i32 512, i32 10) + store %union.anon* %522, %union.anon** %523, align 8, !tbaa !109, !alias.scope !221 + %_M_p.i.i23.i.i1388 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1387, i64 0, i32 0, i32 0 + %524 = load i8*, i8** %_M_p.i.i23.i.i1388, align 8, !tbaa !113 + %525 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1387, i64 0, i32 2 + %arraydecay.i.i.i.i1389 = bitcast %union.anon* %525 to i8* + %cmp.i.i.i1390 = icmp eq i8* %524, %arraydecay.i.i.i.i1389 + br i1 %cmp.i.i.i1390, label %if.then.i.i1392, label %if.else.i.i1396 + +if.then.i.i1392: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1495 + %arraydecay.i.i.i1391 = bitcast %union.anon* %522 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1391, i8* %524, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1399 + +if.else.i.i1396: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1495 + %_M_p.i21.i.i1393 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_2_w_path, i64 0, i32 0, i32 0 + store i8* %524, i8** %_M_p.i21.i.i1393, align 8, !tbaa !113, !alias.scope !221 + %_M_allocated_capacity.i.i1394 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1387, i64 0, i32 2, i32 0 + %526 = load i64, i64* %_M_allocated_capacity.i.i1394, align 8, !tbaa !66 + %_M_allocated_capacity.i.i.i1395 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_2_w_path, i64 0, i32 2, i32 0 + store i64 %526, i64* %_M_allocated_capacity.i.i.i1395, align 8, !tbaa !66, !alias.scope !221 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1399 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1399: ; preds = %if.else.i.i1396, %if.then.i.i1392 + %_M_string_length.i20.i.i1397 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1387, i64 0, i32 1 + %527 = load i64, i64* %_M_string_length.i20.i.i1397, align 8, !tbaa !110 + %_M_string_length.i.i2.i1398 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_2_w_path, i64 0, i32 1 + store i64 %527, i64* %_M_string_length.i.i2.i1398, align 8, !tbaa !110, !alias.scope !221 + %528 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1387 to %union.anon** + store %union.anon* %525, %union.anon** %528, align 8, !tbaa !113 + store i64 0, i64* %_M_string_length.i20.i.i1397, align 8, !tbaa !110 + store i8 0, i8* %arraydecay.i.i.i.i1389, align 1, !tbaa !93 + %_M_p.i.i.i.i1380 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp116, i64 0, i32 0, i32 0 + %529 = load i8*, i8** %_M_p.i.i.i.i1380, align 8, !tbaa !113 + %cmp.i.i.i1382 = icmp eq i8* %529, %518 + br i1 %cmp.i.i.i1382, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1384, label %if.then.i.i1383 + +if.then.i.i1383: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1399 + call void @_ZdlPv(i8* %529) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1384 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1384: ; preds = %if.then.i.i1383, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1399 + call void @llvm.lifetime.end(i64 32, i8* nonnull %515) #2 + %_M_p.i.i1379 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_2_w_path, i64 0, i32 0, i32 0 + %530 = load i8*, i8** %_M_p.i.i1379, align 8, !tbaa !113 + %call119 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %530, i32 0, i64 1, i64 1, i64 512, i64 10) %531 = bitcast %"class.std::__cxx11::basic_string"* %dense_2_b_path to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %531) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %531) #2 %532 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp120 to i8* - call void @llvm.lifetime.start(i64 32, i8* nonnull %532) #7 + call void @llvm.lifetime.start(i64 32, i8* nonnull %532) #2 %533 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp120, i64 0, i32 2 %534 = bitcast %"class.std::__cxx11::basic_string"* %ref.tmp120 to %union.anon** - store %union.anon* %533, %union.anon** %534, align 8, !tbaa !103 + store %union.anon* %533, %union.anon** %534, align 8, !tbaa !109 %535 = bitcast %union.anon* %533 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %535, i8* nonnull getelementptr inbounds ([14 x i8], [14 x i8]* @.str.55, i64 0, i64 0), i64 13, i32 1, i1 false) #7 - %_M_string_length.i.i.i.i.i.i1293 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp120, i64 0, i32 1 - store i64 13, i64* %_M_string_length.i.i.i.i.i.i1293, align 8, !tbaa !104 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %535, i8* nonnull getelementptr inbounds ([14 x i8], [14 x i8]* @.str.73, i64 0, i64 0), i64 13, i32 1, i1 false) #2 + %_M_string_length.i.i.i.i.i.i1302 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp120, i64 0, i32 1 + store i64 13, i64* %_M_string_length.i.i.i.i.i.i1302, align 8, !tbaa !110 %536 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp120, i64 0, i32 2, i32 1, i64 5 - store i8 0, i8* %536, align 1, !tbaa !87 - %537 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !104, !noalias !206 - %538 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !107, !noalias !206 - %call3.i.i.i1230 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp120, i64 0, i64 0, i8* %538, i64 %537) #7, !noalias !206 + store i8 0, i8* %536, align 1, !tbaa !93 + %537 = load i64, i64* %_M_string_length.i.i.i.i.i.i, align 8, !tbaa !110, !noalias !224 + %538 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !113, !noalias !224 + %call3.i.i.i1276 = call dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"* nonnull %ref.tmp120, i64 0, i64 0, i8* %538, i64 %537) #2, !noalias !224 %539 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_2_b_path, i64 0, i32 2 %540 = bitcast %"class.std::__cxx11::basic_string"* %dense_2_b_path to %union.anon** - store %union.anon* %539, %union.anon** %540, align 8, !tbaa !103, !alias.scope !206 - %_M_p.i.i23.i.i1231 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1230, i64 0, i32 0, i32 0 - %541 = load i8*, i8** %_M_p.i.i23.i.i1231, align 8, !tbaa !107 - %542 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1230, i64 0, i32 2 - %arraydecay.i.i.i.i1232 = bitcast %union.anon* %542 to i8* - %cmp.i.i.i1233 = icmp eq i8* %541, %arraydecay.i.i.i.i1232 - br i1 %cmp.i.i.i1233, label %if.then.i.i1235, label %if.else.i.i1239 - -if.then.i.i1235: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1338 - %arraydecay.i.i.i1234 = bitcast %union.anon* %539 to i8* - call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1234, i8* %541, i64 16, i32 1, i1 false) #7 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1242 - -if.else.i.i1239: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1338 - %_M_p.i21.i.i1236 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_2_b_path, i64 0, i32 0, i32 0 - store i8* %541, i8** %_M_p.i21.i.i1236, align 8, !tbaa !107, !alias.scope !206 - %_M_allocated_capacity.i.i1237 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1230, i64 0, i32 2, i32 0 - %543 = load i64, i64* %_M_allocated_capacity.i.i1237, align 8, !tbaa !63 - %_M_allocated_capacity.i.i.i1238 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_2_b_path, i64 0, i32 2, i32 0 - store i64 %543, i64* %_M_allocated_capacity.i.i.i1238, align 8, !tbaa !63, !alias.scope !206 - br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1242 - -_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1242: ; preds = %if.else.i.i1239, %if.then.i.i1235 - %_M_string_length.i20.i.i1240 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1230, i64 0, i32 1 - %544 = load i64, i64* %_M_string_length.i20.i.i1240, align 8, !tbaa !104 - %_M_string_length.i.i2.i1241 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_2_b_path, i64 0, i32 1 - store i64 %544, i64* %_M_string_length.i.i2.i1241, align 8, !tbaa !104, !alias.scope !206 - %545 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1230 to %union.anon** - store %union.anon* %542, %union.anon** %545, align 8, !tbaa !107 - store i64 0, i64* %_M_string_length.i20.i.i1240, align 8, !tbaa !104 - store i8 0, i8* %arraydecay.i.i.i.i1232, align 1, !tbaa !87 - %_M_p.i.i.i.i1223 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp120, i64 0, i32 0, i32 0 - %546 = load i8*, i8** %_M_p.i.i.i.i1223, align 8, !tbaa !107 - %cmp.i.i.i1225 = icmp eq i8* %546, %535 - br i1 %cmp.i.i.i1225, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1227, label %if.then.i.i1226 - -if.then.i.i1226: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1242 - call void @_ZdlPv(i8* %546) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1227 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1227: ; preds = %if.then.i.i1226, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1242 - call void @llvm.lifetime.end(i64 32, i8* nonnull %532) #7 - %_M_p.i.i1222 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_2_b_path, i64 0, i32 0, i32 0 - %547 = load i8*, i8** %_M_p.i.i1222, align 8, !tbaa !107 - %call123 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %547, i32 0, i32 1, i32 10, i32 1, i32 1) - %_M_p.i.i1184 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %input_path, i64 0, i32 0, i32 0 - %548 = load i8*, i8** %_M_p.i.i1184, align 8, !tbaa !107 - %call125 = call %struct.Tensor* @_Z18readTrainedWeightsPKciiiii(i8* %548, i32 0, i32 2000, i32 3, i32 32, i32 32) - %_M_p.i.i1183 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %labels_path, i64 0, i32 0, i32 0 - %549 = load i8*, i8** %_M_p.i.i1183, align 8, !tbaa !107 - %call.i = call noalias i8* @malloc(i64 2000) #7 - %call1.i = call %struct._IO_FILE* @fopen(i8* %549, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.15, i64 0, i64 0)) #7 + store %union.anon* %539, %union.anon** %540, align 8, !tbaa !109, !alias.scope !224 + %_M_p.i.i23.i.i1277 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1276, i64 0, i32 0, i32 0 + %541 = load i8*, i8** %_M_p.i.i23.i.i1277, align 8, !tbaa !113 + %542 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1276, i64 0, i32 2 + %arraydecay.i.i.i.i1278 = bitcast %union.anon* %542 to i8* + %cmp.i.i.i1279 = icmp eq i8* %541, %arraydecay.i.i.i.i1278 + br i1 %cmp.i.i.i1279, label %if.then.i.i1281, label %if.else.i.i1285 + +if.then.i.i1281: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1384 + %arraydecay.i.i.i1280 = bitcast %union.anon* %539 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arraydecay.i.i.i1280, i8* %541, i64 16, i32 1, i1 false) #2 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1288 + +if.else.i.i1285: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1384 + %_M_p.i21.i.i1282 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_2_b_path, i64 0, i32 0, i32 0 + store i8* %541, i8** %_M_p.i21.i.i1282, align 8, !tbaa !113, !alias.scope !224 + %_M_allocated_capacity.i.i1283 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1276, i64 0, i32 2, i32 0 + %543 = load i64, i64* %_M_allocated_capacity.i.i1283, align 8, !tbaa !66 + %_M_allocated_capacity.i.i.i1284 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_2_b_path, i64 0, i32 2, i32 0 + store i64 %543, i64* %_M_allocated_capacity.i.i.i1284, align 8, !tbaa !66, !alias.scope !224 + br label %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1288 + +_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1288: ; preds = %if.else.i.i1285, %if.then.i.i1281 + %_M_string_length.i20.i.i1286 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %call3.i.i.i1276, i64 0, i32 1 + %544 = load i64, i64* %_M_string_length.i20.i.i1286, align 8, !tbaa !110 + %_M_string_length.i.i2.i1287 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_2_b_path, i64 0, i32 1 + store i64 %544, i64* %_M_string_length.i.i2.i1287, align 8, !tbaa !110, !alias.scope !224 + %545 = bitcast %"class.std::__cxx11::basic_string"* %call3.i.i.i1276 to %union.anon** + store %union.anon* %542, %union.anon** %545, align 8, !tbaa !113 + store i64 0, i64* %_M_string_length.i20.i.i1286, align 8, !tbaa !110 + store i8 0, i8* %arraydecay.i.i.i.i1278, align 1, !tbaa !93 + %_M_p.i.i.i.i1232 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %ref.tmp120, i64 0, i32 0, i32 0 + %546 = load i8*, i8** %_M_p.i.i.i.i1232, align 8, !tbaa !113 + %cmp.i.i.i1234 = icmp eq i8* %546, %535 + br i1 %cmp.i.i.i1234, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1236, label %if.then.i.i1235 + +if.then.i.i1235: ; preds = %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1288 + call void @_ZdlPv(i8* %546) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1236 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1236: ; preds = %if.then.i.i1235, %_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_.exit1288 + call void @llvm.lifetime.end(i64 32, i8* nonnull %532) #2 + %_M_p.i.i1231 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %dense_2_b_path, i64 0, i32 0, i32 0 + %547 = load i8*, i8** %_M_p.i.i1231, align 8, !tbaa !113 + %call123 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %547, i32 0, i64 1, i64 10, i64 1, i64 1) + %_M_p.i.i1230 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %input_path, i64 0, i32 0, i32 0 + %548 = load i8*, i8** %_M_p.i.i1230, align 8, !tbaa !113 + %call125 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %548, i32 0, i64 2000, i64 3, i64 32, i64 32) + %_M_p.i.i1192 = getelementptr inbounds %"class.std::__cxx11::basic_string", %"class.std::__cxx11::basic_string"* %labels_path, i64 0, i32 0, i32 0 + %549 = load i8*, i8** %_M_p.i.i1192, align 8, !tbaa !113 + %call.i = call noalias i8* @malloc(i64 8000) #2 + %call1.i = call %struct._IO_FILE* @fopen(i8* %549, i8* getelementptr inbounds ([3 x i8], [3 x i8]* @.str.14, i64 0, i64 0)) #2 %cmp.i = icmp eq %struct._IO_FILE* %call1.i, null - br i1 %cmp.i, label %if.then.i, label %_Z10readLabelsPKci.exit + br i1 %cmp.i, label %if.then.i, label %_Z11readLabels3PKci.exit -if.then.i: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1227 - %call2.i = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([40 x i8], [40 x i8]* @.str.18, i64 0, i64 0), i8* %549) #7 - call void @abort() #8 +if.then.i: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1236 + %call2.i = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([40 x i8], [40 x i8]* @.str.20, i64 0, i64 0), i8* %549) #2 + call void @abort() #13 unreachable -_Z10readLabelsPKci.exit: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1227 - %call5.i = call i64 @fread(i8* %call.i, i64 1, i64 2000, %struct._IO_FILE* nonnull %call1.i) #7 - %call6.i = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([26 x i8], [26 x i8]* @.str.19, i64 0, i64 0), i64 %call5.i) #7 +_Z11readLabels3PKci.exit: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1236 + %550 = bitcast i8* %call.i to i32* + %call5.i = call i64 @fread(i8* %call.i, i64 1, i64 8000, %struct._IO_FILE* nonnull %call1.i) #2 + %call6.i = call i32 @fclose(%struct._IO_FILE* nonnull %call1.i) #2 call void @llvm.visc.init() - %call128 = call noalias i8* @malloc(i64 512) #7 + %call128 = call noalias i8* @malloc(i64 512) #2 %input129 = bitcast i8* %call128 to i8** - %550 = bitcast i8* %call128 to %struct.Tensor** - store %struct.Tensor* %call125, %struct.Tensor** %550, align 1, !tbaa !209 + %551 = bitcast i8* %call128 to %struct.Tensor** + store %struct.Tensor* %call125, %struct.Tensor** %551, align 1, !tbaa !227 %input_bytes = getelementptr inbounds i8, i8* %call128, i64 8 - %551 = bitcast i8* %input_bytes to i64* - store i64 0, i64* %551, align 1, !tbaa !212 + %552 = bitcast i8* %input_bytes to i64* + store i64 0, i64* %552, align 1, !tbaa !230 %conv2d_1_w130 = getelementptr inbounds i8, i8* %call128, i64 16 - %552 = bitcast i8* %conv2d_1_w130 to %struct.Tensor** - store %struct.Tensor* %call7, %struct.Tensor** %552, align 1, !tbaa !213 + %553 = bitcast i8* %conv2d_1_w130 to %struct.Tensor** + store %struct.Tensor* %call7, %struct.Tensor** %553, align 1, !tbaa !231 %conv2d_1_w_bytes = getelementptr inbounds i8, i8* %call128, i64 24 - %553 = bitcast i8* %conv2d_1_w_bytes to i64* - store i64 0, i64* %553, align 1, !tbaa !214 + %554 = bitcast i8* %conv2d_1_w_bytes to i64* + store i64 0, i64* %554, align 1, !tbaa !232 %conv2d_1_b131 = getelementptr inbounds i8, i8* %call128, i64 32 - %554 = bitcast i8* %conv2d_1_b131 to %struct.Tensor** - store %struct.Tensor* %call11, %struct.Tensor** %554, align 1, !tbaa !215 + %555 = bitcast i8* %conv2d_1_b131 to %struct.Tensor** + store %struct.Tensor* %call11, %struct.Tensor** %555, align 1, !tbaa !233 %conv2d_1_b_bytes = getelementptr inbounds i8, i8* %call128, i64 40 - %555 = bitcast i8* %conv2d_1_b_bytes to i64* - store i64 0, i64* %555, align 1, !tbaa !216 + %556 = bitcast i8* %conv2d_1_b_bytes to i64* + store i64 0, i64* %556, align 1, !tbaa !234 %conv2d_2_w132 = getelementptr inbounds i8, i8* %call128, i64 48 - %556 = bitcast i8* %conv2d_2_w132 to %struct.Tensor** - store %struct.Tensor* %call15, %struct.Tensor** %556, align 1, !tbaa !217 + %557 = bitcast i8* %conv2d_2_w132 to %struct.Tensor** + store %struct.Tensor* %call15, %struct.Tensor** %557, align 1, !tbaa !235 %conv2d_2_w_bytes = getelementptr inbounds i8, i8* %call128, i64 56 - %557 = bitcast i8* %conv2d_2_w_bytes to i64* - store i64 0, i64* %557, align 1, !tbaa !218 + %558 = bitcast i8* %conv2d_2_w_bytes to i64* + store i64 0, i64* %558, align 1, !tbaa !236 %conv2d_2_b133 = getelementptr inbounds i8, i8* %call128, i64 64 - %558 = bitcast i8* %conv2d_2_b133 to %struct.Tensor** - store %struct.Tensor* %call19, %struct.Tensor** %558, align 1, !tbaa !219 + %559 = bitcast i8* %conv2d_2_b133 to %struct.Tensor** + store %struct.Tensor* %call19, %struct.Tensor** %559, align 1, !tbaa !237 %conv2d_2_b_bytes = getelementptr inbounds i8, i8* %call128, i64 72 - %559 = bitcast i8* %conv2d_2_b_bytes to i64* - store i64 0, i64* %559, align 1, !tbaa !220 + %560 = bitcast i8* %conv2d_2_b_bytes to i64* + store i64 0, i64* %560, align 1, !tbaa !238 %conv2d_3_w134 = getelementptr inbounds i8, i8* %call128, i64 80 - %560 = bitcast i8* %conv2d_3_w134 to %struct.Tensor** - store %struct.Tensor* %call23, %struct.Tensor** %560, align 1, !tbaa !221 + %561 = bitcast i8* %conv2d_3_w134 to %struct.Tensor** + store %struct.Tensor* %call23, %struct.Tensor** %561, align 1, !tbaa !239 %conv2d_3_w_bytes = getelementptr inbounds i8, i8* %call128, i64 88 - %561 = bitcast i8* %conv2d_3_w_bytes to i64* - store i64 0, i64* %561, align 1, !tbaa !222 + %562 = bitcast i8* %conv2d_3_w_bytes to i64* + store i64 0, i64* %562, align 1, !tbaa !240 %conv2d_3_b135 = getelementptr inbounds i8, i8* %call128, i64 96 - %562 = bitcast i8* %conv2d_3_b135 to %struct.Tensor** - store %struct.Tensor* %call27, %struct.Tensor** %562, align 1, !tbaa !223 + %563 = bitcast i8* %conv2d_3_b135 to %struct.Tensor** + store %struct.Tensor* %call27, %struct.Tensor** %563, align 1, !tbaa !241 %conv2d_3_b_bytes = getelementptr inbounds i8, i8* %call128, i64 104 - %563 = bitcast i8* %conv2d_3_b_bytes to i64* - store i64 0, i64* %563, align 1, !tbaa !224 + %564 = bitcast i8* %conv2d_3_b_bytes to i64* + store i64 0, i64* %564, align 1, !tbaa !242 %conv2d_4_w136 = getelementptr inbounds i8, i8* %call128, i64 112 - %564 = bitcast i8* %conv2d_4_w136 to %struct.Tensor** - store %struct.Tensor* %call31, %struct.Tensor** %564, align 1, !tbaa !225 + %565 = bitcast i8* %conv2d_4_w136 to %struct.Tensor** + store %struct.Tensor* %call31, %struct.Tensor** %565, align 1, !tbaa !243 %conv2d_4_w_bytes = getelementptr inbounds i8, i8* %call128, i64 120 - %565 = bitcast i8* %conv2d_4_w_bytes to i64* - store i64 0, i64* %565, align 1, !tbaa !226 + %566 = bitcast i8* %conv2d_4_w_bytes to i64* + store i64 0, i64* %566, align 1, !tbaa !244 %conv2d_4_b137 = getelementptr inbounds i8, i8* %call128, i64 128 - %566 = bitcast i8* %conv2d_4_b137 to %struct.Tensor** - store %struct.Tensor* %call35, %struct.Tensor** %566, align 1, !tbaa !227 + %567 = bitcast i8* %conv2d_4_b137 to %struct.Tensor** + store %struct.Tensor* %call35, %struct.Tensor** %567, align 1, !tbaa !245 %conv2d_4_b_bytes = getelementptr inbounds i8, i8* %call128, i64 136 - %567 = bitcast i8* %conv2d_4_b_bytes to i64* - store i64 0, i64* %567, align 1, !tbaa !228 + %568 = bitcast i8* %conv2d_4_b_bytes to i64* + store i64 0, i64* %568, align 1, !tbaa !246 %conv2d_5_w138 = getelementptr inbounds i8, i8* %call128, i64 144 - %568 = bitcast i8* %conv2d_5_w138 to %struct.Tensor** - store %struct.Tensor* %call39, %struct.Tensor** %568, align 1, !tbaa !229 + %569 = bitcast i8* %conv2d_5_w138 to %struct.Tensor** + store %struct.Tensor* %call39, %struct.Tensor** %569, align 1, !tbaa !247 %conv2d_5_w_bytes = getelementptr inbounds i8, i8* %call128, i64 152 - %569 = bitcast i8* %conv2d_5_w_bytes to i64* - store i64 0, i64* %569, align 1, !tbaa !230 + %570 = bitcast i8* %conv2d_5_w_bytes to i64* + store i64 0, i64* %570, align 1, !tbaa !248 %conv2d_5_b139 = getelementptr inbounds i8, i8* %call128, i64 160 - %570 = bitcast i8* %conv2d_5_b139 to %struct.Tensor** - store %struct.Tensor* %call43, %struct.Tensor** %570, align 1, !tbaa !231 + %571 = bitcast i8* %conv2d_5_b139 to %struct.Tensor** + store %struct.Tensor* %call43, %struct.Tensor** %571, align 1, !tbaa !249 %conv2d_5_b_bytes = getelementptr inbounds i8, i8* %call128, i64 168 - %571 = bitcast i8* %conv2d_5_b_bytes to i64* - store i64 0, i64* %571, align 1, !tbaa !232 + %572 = bitcast i8* %conv2d_5_b_bytes to i64* + store i64 0, i64* %572, align 1, !tbaa !250 %conv2d_6_w140 = getelementptr inbounds i8, i8* %call128, i64 176 - %572 = bitcast i8* %conv2d_6_w140 to %struct.Tensor** - store %struct.Tensor* %call47, %struct.Tensor** %572, align 1, !tbaa !233 + %573 = bitcast i8* %conv2d_6_w140 to %struct.Tensor** + store %struct.Tensor* %call47, %struct.Tensor** %573, align 1, !tbaa !251 %conv2d_6_w_bytes = getelementptr inbounds i8, i8* %call128, i64 184 - %573 = bitcast i8* %conv2d_6_w_bytes to i64* - store i64 0, i64* %573, align 1, !tbaa !234 + %574 = bitcast i8* %conv2d_6_w_bytes to i64* + store i64 0, i64* %574, align 1, !tbaa !252 %conv2d_6_b141 = getelementptr inbounds i8, i8* %call128, i64 192 - %574 = bitcast i8* %conv2d_6_b141 to %struct.Tensor** - store %struct.Tensor* %call51, %struct.Tensor** %574, align 1, !tbaa !235 + %575 = bitcast i8* %conv2d_6_b141 to %struct.Tensor** + store %struct.Tensor* %call51, %struct.Tensor** %575, align 1, !tbaa !253 %conv2d_6_b_bytes = getelementptr inbounds i8, i8* %call128, i64 200 - %575 = bitcast i8* %conv2d_6_b_bytes to i64* - store i64 0, i64* %575, align 1, !tbaa !236 + %576 = bitcast i8* %conv2d_6_b_bytes to i64* + store i64 0, i64* %576, align 1, !tbaa !254 %conv2d_7_w142 = getelementptr inbounds i8, i8* %call128, i64 208 - %576 = bitcast i8* %conv2d_7_w142 to %struct.Tensor** - store %struct.Tensor* %call55, %struct.Tensor** %576, align 1, !tbaa !237 + %577 = bitcast i8* %conv2d_7_w142 to %struct.Tensor** + store %struct.Tensor* %call55, %struct.Tensor** %577, align 1, !tbaa !255 %conv2d_7_w_bytes = getelementptr inbounds i8, i8* %call128, i64 216 - %577 = bitcast i8* %conv2d_7_w_bytes to i64* - store i64 0, i64* %577, align 1, !tbaa !238 + %578 = bitcast i8* %conv2d_7_w_bytes to i64* + store i64 0, i64* %578, align 1, !tbaa !256 %conv2d_7_b143 = getelementptr inbounds i8, i8* %call128, i64 224 - %578 = bitcast i8* %conv2d_7_b143 to %struct.Tensor** - store %struct.Tensor* %call59, %struct.Tensor** %578, align 1, !tbaa !239 + %579 = bitcast i8* %conv2d_7_b143 to %struct.Tensor** + store %struct.Tensor* %call59, %struct.Tensor** %579, align 1, !tbaa !257 %conv2d_7_b_bytes = getelementptr inbounds i8, i8* %call128, i64 232 - %579 = bitcast i8* %conv2d_7_b_bytes to i64* - store i64 0, i64* %579, align 1, !tbaa !240 + %580 = bitcast i8* %conv2d_7_b_bytes to i64* + store i64 0, i64* %580, align 1, !tbaa !258 %conv2d_8_w144 = getelementptr inbounds i8, i8* %call128, i64 240 - %580 = bitcast i8* %conv2d_8_w144 to %struct.Tensor** - store %struct.Tensor* %call63, %struct.Tensor** %580, align 1, !tbaa !241 + %581 = bitcast i8* %conv2d_8_w144 to %struct.Tensor** + store %struct.Tensor* %call63, %struct.Tensor** %581, align 1, !tbaa !259 %conv2d_8_w_bytes = getelementptr inbounds i8, i8* %call128, i64 248 - %581 = bitcast i8* %conv2d_8_w_bytes to i64* - store i64 0, i64* %581, align 1, !tbaa !242 + %582 = bitcast i8* %conv2d_8_w_bytes to i64* + store i64 0, i64* %582, align 1, !tbaa !260 %conv2d_8_b145 = getelementptr inbounds i8, i8* %call128, i64 256 - %582 = bitcast i8* %conv2d_8_b145 to %struct.Tensor** - store %struct.Tensor* %call67, %struct.Tensor** %582, align 1, !tbaa !243 + %583 = bitcast i8* %conv2d_8_b145 to %struct.Tensor** + store %struct.Tensor* %call67, %struct.Tensor** %583, align 1, !tbaa !261 %conv2d_8_b_bytes = getelementptr inbounds i8, i8* %call128, i64 264 - %583 = bitcast i8* %conv2d_8_b_bytes to i64* - store i64 0, i64* %583, align 1, !tbaa !244 + %584 = bitcast i8* %conv2d_8_b_bytes to i64* + store i64 0, i64* %584, align 1, !tbaa !262 %conv2d_9_w146 = getelementptr inbounds i8, i8* %call128, i64 272 - %584 = bitcast i8* %conv2d_9_w146 to %struct.Tensor** - store %struct.Tensor* %call71, %struct.Tensor** %584, align 1, !tbaa !245 + %585 = bitcast i8* %conv2d_9_w146 to %struct.Tensor** + store %struct.Tensor* %call71, %struct.Tensor** %585, align 1, !tbaa !263 %conv2d_9_w_bytes = getelementptr inbounds i8, i8* %call128, i64 280 - %585 = bitcast i8* %conv2d_9_w_bytes to i64* - store i64 0, i64* %585, align 1, !tbaa !246 + %586 = bitcast i8* %conv2d_9_w_bytes to i64* + store i64 0, i64* %586, align 1, !tbaa !264 %conv2d_9_b147 = getelementptr inbounds i8, i8* %call128, i64 288 - %586 = bitcast i8* %conv2d_9_b147 to %struct.Tensor** - store %struct.Tensor* %call75, %struct.Tensor** %586, align 1, !tbaa !247 + %587 = bitcast i8* %conv2d_9_b147 to %struct.Tensor** + store %struct.Tensor* %call75, %struct.Tensor** %587, align 1, !tbaa !265 %conv2d_9_b_bytes = getelementptr inbounds i8, i8* %call128, i64 296 - %587 = bitcast i8* %conv2d_9_b_bytes to i64* - store i64 0, i64* %587, align 1, !tbaa !248 + %588 = bitcast i8* %conv2d_9_b_bytes to i64* + store i64 0, i64* %588, align 1, !tbaa !266 %conv2d_10_w148 = getelementptr inbounds i8, i8* %call128, i64 304 - %588 = bitcast i8* %conv2d_10_w148 to %struct.Tensor** - store %struct.Tensor* %call79, %struct.Tensor** %588, align 1, !tbaa !249 + %589 = bitcast i8* %conv2d_10_w148 to %struct.Tensor** + store %struct.Tensor* %call79, %struct.Tensor** %589, align 1, !tbaa !267 %conv2d_10_w_bytes = getelementptr inbounds i8, i8* %call128, i64 312 - %589 = bitcast i8* %conv2d_10_w_bytes to i64* - store i64 0, i64* %589, align 1, !tbaa !250 + %590 = bitcast i8* %conv2d_10_w_bytes to i64* + store i64 0, i64* %590, align 1, !tbaa !268 %conv2d_10_b149 = getelementptr inbounds i8, i8* %call128, i64 320 - %590 = bitcast i8* %conv2d_10_b149 to %struct.Tensor** - store %struct.Tensor* %call83, %struct.Tensor** %590, align 1, !tbaa !251 + %591 = bitcast i8* %conv2d_10_b149 to %struct.Tensor** + store %struct.Tensor* %call83, %struct.Tensor** %591, align 1, !tbaa !269 %conv2d_10_b_bytes = getelementptr inbounds i8, i8* %call128, i64 328 - %591 = bitcast i8* %conv2d_10_b_bytes to i64* - store i64 0, i64* %591, align 1, !tbaa !252 + %592 = bitcast i8* %conv2d_10_b_bytes to i64* + store i64 0, i64* %592, align 1, !tbaa !270 %conv2d_11_w150 = getelementptr inbounds i8, i8* %call128, i64 336 - %592 = bitcast i8* %conv2d_11_w150 to %struct.Tensor** - store %struct.Tensor* %call87, %struct.Tensor** %592, align 1, !tbaa !253 + %593 = bitcast i8* %conv2d_11_w150 to %struct.Tensor** + store %struct.Tensor* %call87, %struct.Tensor** %593, align 1, !tbaa !271 %conv2d_11_w_bytes = getelementptr inbounds i8, i8* %call128, i64 344 - %593 = bitcast i8* %conv2d_11_w_bytes to i64* - store i64 0, i64* %593, align 1, !tbaa !254 + %594 = bitcast i8* %conv2d_11_w_bytes to i64* + store i64 0, i64* %594, align 1, !tbaa !272 %conv2d_11_b151 = getelementptr inbounds i8, i8* %call128, i64 352 - %594 = bitcast i8* %conv2d_11_b151 to %struct.Tensor** - store %struct.Tensor* %call91, %struct.Tensor** %594, align 1, !tbaa !255 + %595 = bitcast i8* %conv2d_11_b151 to %struct.Tensor** + store %struct.Tensor* %call91, %struct.Tensor** %595, align 1, !tbaa !273 %conv2d_11_b_bytes = getelementptr inbounds i8, i8* %call128, i64 360 - %595 = bitcast i8* %conv2d_11_b_bytes to i64* - store i64 0, i64* %595, align 1, !tbaa !256 + %596 = bitcast i8* %conv2d_11_b_bytes to i64* + store i64 0, i64* %596, align 1, !tbaa !274 %conv2d_12_w152 = getelementptr inbounds i8, i8* %call128, i64 368 - %596 = bitcast i8* %conv2d_12_w152 to %struct.Tensor** - store %struct.Tensor* %call95, %struct.Tensor** %596, align 1, !tbaa !257 + %597 = bitcast i8* %conv2d_12_w152 to %struct.Tensor** + store %struct.Tensor* %call95, %struct.Tensor** %597, align 1, !tbaa !275 %conv2d_12_w_bytes = getelementptr inbounds i8, i8* %call128, i64 376 - %597 = bitcast i8* %conv2d_12_w_bytes to i64* - store i64 0, i64* %597, align 1, !tbaa !258 + %598 = bitcast i8* %conv2d_12_w_bytes to i64* + store i64 0, i64* %598, align 1, !tbaa !276 %conv2d_12_b153 = getelementptr inbounds i8, i8* %call128, i64 384 - %598 = bitcast i8* %conv2d_12_b153 to %struct.Tensor** - store %struct.Tensor* %call99, %struct.Tensor** %598, align 1, !tbaa !259 + %599 = bitcast i8* %conv2d_12_b153 to %struct.Tensor** + store %struct.Tensor* %call99, %struct.Tensor** %599, align 1, !tbaa !277 %conv2d_12_b_bytes = getelementptr inbounds i8, i8* %call128, i64 392 - %599 = bitcast i8* %conv2d_12_b_bytes to i64* - store i64 0, i64* %599, align 1, !tbaa !260 + %600 = bitcast i8* %conv2d_12_b_bytes to i64* + store i64 0, i64* %600, align 1, !tbaa !278 %conv2d_13_w154 = getelementptr inbounds i8, i8* %call128, i64 400 - %600 = bitcast i8* %conv2d_13_w154 to %struct.Tensor** - store %struct.Tensor* %call103, %struct.Tensor** %600, align 1, !tbaa !261 + %601 = bitcast i8* %conv2d_13_w154 to %struct.Tensor** + store %struct.Tensor* %call103, %struct.Tensor** %601, align 1, !tbaa !279 %conv2d_13_w_bytes = getelementptr inbounds i8, i8* %call128, i64 408 - %601 = bitcast i8* %conv2d_13_w_bytes to i64* - store i64 0, i64* %601, align 1, !tbaa !262 + %602 = bitcast i8* %conv2d_13_w_bytes to i64* + store i64 0, i64* %602, align 1, !tbaa !280 %conv2d_13_b155 = getelementptr inbounds i8, i8* %call128, i64 416 - %602 = bitcast i8* %conv2d_13_b155 to %struct.Tensor** - store %struct.Tensor* %call107, %struct.Tensor** %602, align 1, !tbaa !263 + %603 = bitcast i8* %conv2d_13_b155 to %struct.Tensor** + store %struct.Tensor* %call107, %struct.Tensor** %603, align 1, !tbaa !281 %conv2d_13_b_bytes = getelementptr inbounds i8, i8* %call128, i64 424 - %603 = bitcast i8* %conv2d_13_b_bytes to i64* - store i64 0, i64* %603, align 1, !tbaa !264 + %604 = bitcast i8* %conv2d_13_b_bytes to i64* + store i64 0, i64* %604, align 1, !tbaa !282 %dense_1_w156 = getelementptr inbounds i8, i8* %call128, i64 432 - %604 = bitcast i8* %dense_1_w156 to %struct.Tensor** - store %struct.Tensor* %call111, %struct.Tensor** %604, align 1, !tbaa !265 + %605 = bitcast i8* %dense_1_w156 to %struct.Tensor** + store %struct.Tensor* %call111, %struct.Tensor** %605, align 1, !tbaa !283 %dense_1_w_bytes = getelementptr inbounds i8, i8* %call128, i64 440 - %605 = bitcast i8* %dense_1_w_bytes to i64* - store i64 0, i64* %605, align 1, !tbaa !266 + %606 = bitcast i8* %dense_1_w_bytes to i64* + store i64 0, i64* %606, align 1, !tbaa !284 %dense_1_b157 = getelementptr inbounds i8, i8* %call128, i64 448 - %606 = bitcast i8* %dense_1_b157 to %struct.Tensor** - store %struct.Tensor* %call115, %struct.Tensor** %606, align 1, !tbaa !267 + %607 = bitcast i8* %dense_1_b157 to %struct.Tensor** + store %struct.Tensor* %call115, %struct.Tensor** %607, align 1, !tbaa !285 %dense_1_b_bytes = getelementptr inbounds i8, i8* %call128, i64 456 - %607 = bitcast i8* %dense_1_b_bytes to i64* - store i64 0, i64* %607, align 1, !tbaa !268 + %608 = bitcast i8* %dense_1_b_bytes to i64* + store i64 0, i64* %608, align 1, !tbaa !286 %dense_2_w158 = getelementptr inbounds i8, i8* %call128, i64 464 - %608 = bitcast i8* %dense_2_w158 to %struct.Tensor** - store %struct.Tensor* %call119, %struct.Tensor** %608, align 1, !tbaa !269 + %609 = bitcast i8* %dense_2_w158 to %struct.Tensor** + store %struct.Tensor* %call119, %struct.Tensor** %609, align 1, !tbaa !287 %dense_2_w_bytes = getelementptr inbounds i8, i8* %call128, i64 472 - %609 = bitcast i8* %dense_2_w_bytes to i64* - store i64 0, i64* %609, align 1, !tbaa !270 + %610 = bitcast i8* %dense_2_w_bytes to i64* + store i64 0, i64* %610, align 1, !tbaa !288 %dense_2_b159 = getelementptr inbounds i8, i8* %call128, i64 480 - %610 = bitcast i8* %dense_2_b159 to %struct.Tensor** - store %struct.Tensor* %call123, %struct.Tensor** %610, align 1, !tbaa !271 + %611 = bitcast i8* %dense_2_b159 to %struct.Tensor** + store %struct.Tensor* %call123, %struct.Tensor** %611, align 1, !tbaa !289 %dense_2_b_bytes = getelementptr inbounds i8, i8* %call128, i64 488 - %611 = bitcast i8* %dense_2_b_bytes to i64* - store i64 0, i64* %611, align 1, !tbaa !272 + %612 = bitcast i8* %dense_2_b_bytes to i64* + store i64 0, i64* %612, align 1, !tbaa !290 + call void @startMemTracking() #2 + call void @startProfiling() #2 + %613 = load i8*, i8** %_M_p.i.i1230, align 8, !tbaa !113 + %call161 = call %struct.Tensor* @_Z18readTrainedWeightsPKcillll(i8* %613, i32 0, i64 2000, i64 3, i64 32, i64 32) + store %struct.Tensor* %call161, %struct.Tensor** %551, align 1, !tbaa !227 + store i64 0, i64* %552, align 1, !tbaa !230 %graphID = call i8* @llvm.visc.launch(i8* bitcast (%struct.out._Z4rootPvmS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_m (i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64)* @_Z4rootPvmS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_m_cloned to i8*), i8* %call128, i1 false) call void @llvm.visc.wait(i8* %graphID) - %612 = load i8*, i8** %input129, align 1, !tbaa !209 - call void @hpvm_request_tensor(i8* %612, i32 0) #7 + %614 = load i8*, i8** %input129, align 1, !tbaa !227 + call void @hpvm_request_tensor(i8* %614, i32 0) #2 + %call166 = call fast float @_Z16computeAccuracy3PjPv(i32* %550, i8* %614) + call void @freeBatchMemory() #2 + call void @stopProfiling() #2 call void @llvm.visc.cleanup() - call void @_Z16computeAccuracy2PhiPv(i8* %call.i, i32 undef, i8* %612) - %613 = load i8*, i8** %_M_p.i.i1222, align 8, !tbaa !107 - %arraydecay.i.i.i.i1031 = bitcast %union.anon* %539 to i8* - %cmp.i.i.i1032 = icmp eq i8* %613, %arraydecay.i.i.i.i1031 - br i1 %cmp.i.i.i1032, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1034, label %if.then.i.i1033 - -if.then.i.i1033: ; preds = %_Z10readLabelsPKci.exit - call void @_ZdlPv(i8* %613) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1034 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1034: ; preds = %if.then.i.i1033, %_Z10readLabelsPKci.exit - call void @llvm.lifetime.end(i64 32, i8* nonnull %531) #7 - %614 = load i8*, i8** %_M_p.i.i1333, align 8, !tbaa !107 - %arraydecay.i.i.i.i989 = bitcast %union.anon* %522 to i8* - %cmp.i.i.i990 = icmp eq i8* %614, %arraydecay.i.i.i.i989 - br i1 %cmp.i.i.i990, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit992, label %if.then.i.i991 - -if.then.i.i991: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1034 - call void @_ZdlPv(i8* %614) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit992 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit992: ; preds = %if.then.i.i991, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1034 - call void @llvm.lifetime.end(i64 32, i8* nonnull %514) #7 - %615 = load i8*, i8** %_M_p.i.i1481, align 8, !tbaa !107 - %arraydecay.i.i.i.i984 = bitcast %union.anon* %505 to i8* - %cmp.i.i.i985 = icmp eq i8* %615, %arraydecay.i.i.i.i984 - br i1 %cmp.i.i.i985, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit987, label %if.then.i.i986 - -if.then.i.i986: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit992 - call void @_ZdlPv(i8* %615) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit987 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit987: ; preds = %if.then.i.i986, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit992 - call void @llvm.lifetime.end(i64 32, i8* nonnull %497) #7 - %616 = load i8*, i8** %_M_p.i.i1591, align 8, !tbaa !107 - %arraydecay.i.i.i.i942 = bitcast %union.anon* %488 to i8* - %cmp.i.i.i943 = icmp eq i8* %616, %arraydecay.i.i.i.i942 - br i1 %cmp.i.i.i943, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit945, label %if.then.i.i944 - -if.then.i.i944: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit987 - call void @_ZdlPv(i8* %616) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit945 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit945: ; preds = %if.then.i.i944, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit987 - call void @llvm.lifetime.end(i64 32, i8* nonnull %480) #7 - %617 = load i8*, i8** %_M_p.i.i1538, align 8, !tbaa !107 - %arraydecay.i.i.i.i937 = bitcast %union.anon* %471 to i8* - %cmp.i.i.i938 = icmp eq i8* %617, %arraydecay.i.i.i.i937 - br i1 %cmp.i.i.i938, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit940, label %if.then.i.i939 - -if.then.i.i939: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit945 - call void @_ZdlPv(i8* %617) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit940 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit940: ; preds = %if.then.i.i939, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit945 - call void @llvm.lifetime.end(i64 32, i8* nonnull %463) #7 - %618 = load i8*, i8** %_M_p.i.i1480, align 8, !tbaa !107 - %arraydecay.i.i.i.i895 = bitcast %union.anon* %454 to i8* - %cmp.i.i.i896 = icmp eq i8* %618, %arraydecay.i.i.i.i895 - br i1 %cmp.i.i.i896, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit898, label %if.then.i.i897 - -if.then.i.i897: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit940 - call void @_ZdlPv(i8* %618) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit898 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit898: ; preds = %if.then.i.i897, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit940 - call void @llvm.lifetime.end(i64 32, i8* nonnull %446) #7 - %619 = load i8*, i8** %_M_p.i.i1443, align 8, !tbaa !107 - %arraydecay.i.i.i.i890 = bitcast %union.anon* %437 to i8* - %cmp.i.i.i891 = icmp eq i8* %619, %arraydecay.i.i.i.i890 - br i1 %cmp.i.i.i891, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit893, label %if.then.i.i892 - -if.then.i.i892: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit898 - call void @_ZdlPv(i8* %619) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit893 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit893: ; preds = %if.then.i.i892, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit898 - call void @llvm.lifetime.end(i64 32, i8* nonnull %429) #7 - %620 = load i8*, i8** %_M_p.i.i1375, align 8, !tbaa !107 - %arraydecay.i.i.i.i848 = bitcast %union.anon* %420 to i8* - %cmp.i.i.i849 = icmp eq i8* %620, %arraydecay.i.i.i.i848 - br i1 %cmp.i.i.i849, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit851, label %if.then.i.i850 - -if.then.i.i850: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit893 - call void @_ZdlPv(i8* %620) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit851 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit851: ; preds = %if.then.i.i850, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit893 - call void @llvm.lifetime.end(i64 32, i8* nonnull %412) #7 - %621 = load i8*, i8** %_M_p.i.i1332, align 8, !tbaa !107 - %arraydecay.i.i.i.i843 = bitcast %union.anon* %403 to i8* - %cmp.i.i.i844 = icmp eq i8* %621, %arraydecay.i.i.i.i843 - br i1 %cmp.i.i.i844, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit846, label %if.then.i.i845 - -if.then.i.i845: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit851 - call void @_ZdlPv(i8* %621) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit846 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit846: ; preds = %if.then.i.i845, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit851 - call void @llvm.lifetime.end(i64 32, i8* nonnull %395) #7 - %622 = load i8*, i8** %_M_p.i.i1279, align 8, !tbaa !107 - %arraydecay.i.i.i.i801 = bitcast %union.anon* %386 to i8* - %cmp.i.i.i802 = icmp eq i8* %622, %arraydecay.i.i.i.i801 - br i1 %cmp.i.i.i802, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit804, label %if.then.i.i803 - -if.then.i.i803: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit846 - call void @_ZdlPv(i8* %622) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit804 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit804: ; preds = %if.then.i.i803, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit846 - call void @llvm.lifetime.end(i64 32, i8* nonnull %378) #7 - %623 = load i8*, i8** %_M_p.i.i1221, align 8, !tbaa !107 - %arraydecay.i.i.i.i796 = bitcast %union.anon* %369 to i8* - %cmp.i.i.i797 = icmp eq i8* %623, %arraydecay.i.i.i.i796 - br i1 %cmp.i.i.i797, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit799, label %if.then.i.i798 - -if.then.i.i798: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit804 - call void @_ZdlPv(i8* %623) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit799 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit799: ; preds = %if.then.i.i798, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit804 - call void @llvm.lifetime.end(i64 32, i8* nonnull %361) #7 - %624 = load i8*, i8** %_M_p.i.i1182, align 8, !tbaa !107 - %arraydecay.i.i.i.i754 = bitcast %union.anon* %352 to i8* - %cmp.i.i.i755 = icmp eq i8* %624, %arraydecay.i.i.i.i754 - br i1 %cmp.i.i.i755, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit757, label %if.then.i.i756 - -if.then.i.i756: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit799 - call void @_ZdlPv(i8* %624) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit757 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit757: ; preds = %if.then.i.i756, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit799 - call void @llvm.lifetime.end(i64 32, i8* nonnull %344) #7 - %625 = load i8*, i8** %_M_p.i.i1145, align 8, !tbaa !107 - %arraydecay.i.i.i.i749 = bitcast %union.anon* %335 to i8* - %cmp.i.i.i750 = icmp eq i8* %625, %arraydecay.i.i.i.i749 - br i1 %cmp.i.i.i750, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit752, label %if.then.i.i751 - -if.then.i.i751: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit757 - call void @_ZdlPv(i8* %625) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit752 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit752: ; preds = %if.then.i.i751, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit757 - call void @llvm.lifetime.end(i64 32, i8* nonnull %327) #7 - %626 = load i8*, i8** %_M_p.i.i1108, align 8, !tbaa !107 - %arraydecay.i.i.i.i707 = bitcast %union.anon* %318 to i8* - %cmp.i.i.i708 = icmp eq i8* %626, %arraydecay.i.i.i.i707 - br i1 %cmp.i.i.i708, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit710, label %if.then.i.i709 - -if.then.i.i709: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit752 - call void @_ZdlPv(i8* %626) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit710 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit710: ; preds = %if.then.i.i709, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit752 - call void @llvm.lifetime.end(i64 32, i8* nonnull %310) #7 - %627 = load i8*, i8** %_M_p.i.i1071, align 8, !tbaa !107 - %arraydecay.i.i.i.i702 = bitcast %union.anon* %301 to i8* - %cmp.i.i.i703 = icmp eq i8* %627, %arraydecay.i.i.i.i702 - br i1 %cmp.i.i.i703, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit705, label %if.then.i.i704 - -if.then.i.i704: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit710 - call void @_ZdlPv(i8* %627) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit705 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit705: ; preds = %if.then.i.i704, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit710 - call void @llvm.lifetime.end(i64 32, i8* nonnull %293) #7 - %628 = load i8*, i8** %_M_p.i.i1029, align 8, !tbaa !107 - %arraydecay.i.i.i.i660 = bitcast %union.anon* %284 to i8* - %cmp.i.i.i661 = icmp eq i8* %628, %arraydecay.i.i.i.i660 - br i1 %cmp.i.i.i661, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit663, label %if.then.i.i662 - -if.then.i.i662: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit705 - call void @_ZdlPv(i8* %628) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit663 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit663: ; preds = %if.then.i.i662, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit705 - call void @llvm.lifetime.end(i64 32, i8* nonnull %276) #7 - %629 = load i8*, i8** %_M_p.i.i982, align 8, !tbaa !107 - %arraydecay.i.i.i.i655 = bitcast %union.anon* %267 to i8* - %cmp.i.i.i656 = icmp eq i8* %629, %arraydecay.i.i.i.i655 - br i1 %cmp.i.i.i656, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit658, label %if.then.i.i657 - -if.then.i.i657: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit663 - call void @_ZdlPv(i8* %629) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit658 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit658: ; preds = %if.then.i.i657, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit663 - call void @llvm.lifetime.end(i64 32, i8* nonnull %259) #7 - %630 = load i8*, i8** %_M_p.i.i935, align 8, !tbaa !107 - %arraydecay.i.i.i.i613 = bitcast %union.anon* %250 to i8* - %cmp.i.i.i614 = icmp eq i8* %630, %arraydecay.i.i.i.i613 - br i1 %cmp.i.i.i614, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit616, label %if.then.i.i615 - -if.then.i.i615: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit658 - call void @_ZdlPv(i8* %630) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit616 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit616: ; preds = %if.then.i.i615, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit658 - call void @llvm.lifetime.end(i64 32, i8* nonnull %242) #7 - %631 = load i8*, i8** %_M_p.i.i888, align 8, !tbaa !107 - %arraydecay.i.i.i.i608 = bitcast %union.anon* %233 to i8* - %cmp.i.i.i609 = icmp eq i8* %631, %arraydecay.i.i.i.i608 - br i1 %cmp.i.i.i609, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit611, label %if.then.i.i610 - -if.then.i.i610: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit616 - call void @_ZdlPv(i8* %631) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit611 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit611: ; preds = %if.then.i.i610, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit616 - call void @llvm.lifetime.end(i64 32, i8* nonnull %225) #7 - %632 = load i8*, i8** %_M_p.i.i841, align 8, !tbaa !107 - %arraydecay.i.i.i.i566 = bitcast %union.anon* %216 to i8* - %cmp.i.i.i567 = icmp eq i8* %632, %arraydecay.i.i.i.i566 - br i1 %cmp.i.i.i567, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit569, label %if.then.i.i568 - -if.then.i.i568: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit611 - call void @_ZdlPv(i8* %632) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit569 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit569: ; preds = %if.then.i.i568, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit611 - call void @llvm.lifetime.end(i64 32, i8* nonnull %208) #7 - %633 = load i8*, i8** %_M_p.i.i794, align 8, !tbaa !107 - %arraydecay.i.i.i.i561 = bitcast %union.anon* %199 to i8* - %cmp.i.i.i562 = icmp eq i8* %633, %arraydecay.i.i.i.i561 - br i1 %cmp.i.i.i562, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit564, label %if.then.i.i563 - -if.then.i.i563: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit569 - call void @_ZdlPv(i8* %633) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit564 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit564: ; preds = %if.then.i.i563, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit569 - call void @llvm.lifetime.end(i64 32, i8* nonnull %191) #7 - %634 = load i8*, i8** %_M_p.i.i747, align 8, !tbaa !107 - %arraydecay.i.i.i.i519 = bitcast %union.anon* %182 to i8* - %cmp.i.i.i520 = icmp eq i8* %634, %arraydecay.i.i.i.i519 - br i1 %cmp.i.i.i520, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit522, label %if.then.i.i521 - -if.then.i.i521: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit564 - call void @_ZdlPv(i8* %634) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit522 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit522: ; preds = %if.then.i.i521, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit564 - call void @llvm.lifetime.end(i64 32, i8* nonnull %174) #7 - %635 = load i8*, i8** %_M_p.i.i700, align 8, !tbaa !107 - %arraydecay.i.i.i.i514 = bitcast %union.anon* %165 to i8* - %cmp.i.i.i515 = icmp eq i8* %635, %arraydecay.i.i.i.i514 - br i1 %cmp.i.i.i515, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit517, label %if.then.i.i516 - -if.then.i.i516: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit522 - call void @_ZdlPv(i8* %635) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit517 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit517: ; preds = %if.then.i.i516, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit522 - call void @llvm.lifetime.end(i64 32, i8* nonnull %157) #7 - %636 = load i8*, i8** %_M_p.i.i653, align 8, !tbaa !107 - %arraydecay.i.i.i.i472 = bitcast %union.anon* %148 to i8* - %cmp.i.i.i473 = icmp eq i8* %636, %arraydecay.i.i.i.i472 - br i1 %cmp.i.i.i473, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit475, label %if.then.i.i474 - -if.then.i.i474: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit517 - call void @_ZdlPv(i8* %636) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit475 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit475: ; preds = %if.then.i.i474, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit517 - call void @llvm.lifetime.end(i64 32, i8* nonnull %140) #7 - %637 = load i8*, i8** %_M_p.i.i606, align 8, !tbaa !107 - %arraydecay.i.i.i.i467 = bitcast %union.anon* %131 to i8* - %cmp.i.i.i468 = icmp eq i8* %637, %arraydecay.i.i.i.i467 - br i1 %cmp.i.i.i468, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit470, label %if.then.i.i469 - -if.then.i.i469: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit475 - call void @_ZdlPv(i8* %637) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit470 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit470: ; preds = %if.then.i.i469, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit475 - call void @llvm.lifetime.end(i64 32, i8* nonnull %123) #7 - %638 = load i8*, i8** %_M_p.i.i559, align 8, !tbaa !107 - %arraydecay.i.i.i.i425 = bitcast %union.anon* %114 to i8* - %cmp.i.i.i426 = icmp eq i8* %638, %arraydecay.i.i.i.i425 - br i1 %cmp.i.i.i426, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit428, label %if.then.i.i427 - -if.then.i.i427: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit470 - call void @_ZdlPv(i8* %638) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit428 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit428: ; preds = %if.then.i.i427, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit470 - call void @llvm.lifetime.end(i64 32, i8* nonnull %106) #7 - %639 = load i8*, i8** %_M_p.i.i512, align 8, !tbaa !107 - %arraydecay.i.i.i.i420 = bitcast %union.anon* %97 to i8* - %cmp.i.i.i421 = icmp eq i8* %639, %arraydecay.i.i.i.i420 - br i1 %cmp.i.i.i421, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit423, label %if.then.i.i422 - -if.then.i.i422: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit428 - call void @_ZdlPv(i8* %639) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit423 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit423: ; preds = %if.then.i.i422, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit428 - call void @llvm.lifetime.end(i64 32, i8* nonnull %89) #7 - %640 = load i8*, i8** %_M_p.i.i465, align 8, !tbaa !107 - %arraydecay.i.i.i.i378 = bitcast %union.anon* %80 to i8* - %cmp.i.i.i379 = icmp eq i8* %640, %arraydecay.i.i.i.i378 - br i1 %cmp.i.i.i379, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit381, label %if.then.i.i380 - -if.then.i.i380: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit423 - call void @_ZdlPv(i8* %640) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit381 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit381: ; preds = %if.then.i.i380, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit423 - call void @llvm.lifetime.end(i64 32, i8* nonnull %72) #7 - %641 = load i8*, i8** %_M_p.i.i418, align 8, !tbaa !107 - %arraydecay.i.i.i.i373 = bitcast %union.anon* %63 to i8* - %cmp.i.i.i374 = icmp eq i8* %641, %arraydecay.i.i.i.i373 - br i1 %cmp.i.i.i374, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit376, label %if.then.i.i375 - -if.then.i.i375: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit381 - call void @_ZdlPv(i8* %641) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit376 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit376: ; preds = %if.then.i.i375, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit381 - call void @llvm.lifetime.end(i64 32, i8* nonnull %55) #7 - %642 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !107 - %arraydecay.i.i.i.i332 = bitcast %union.anon* %46 to i8* - %cmp.i.i.i333 = icmp eq i8* %642, %arraydecay.i.i.i.i332 - br i1 %cmp.i.i.i333, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit335, label %if.then.i.i334 - -if.then.i.i334: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit376 - call void @_ZdlPv(i8* %642) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit335 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit335: ; preds = %if.then.i.i334, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit376 - call void @llvm.lifetime.end(i64 32, i8* nonnull %38) #7 - %643 = load i8*, i8** %_M_p.i.i1183, align 8, !tbaa !107 - %arraydecay.i.i.i.i291 = bitcast %union.anon* %30 to i8* - %cmp.i.i.i292 = icmp eq i8* %643, %arraydecay.i.i.i.i291 - br i1 %cmp.i.i.i292, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit294, label %if.then.i.i293 - -if.then.i.i293: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit335 - call void @_ZdlPv(i8* %643) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit294 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit294: ; preds = %if.then.i.i293, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit335 - call void @llvm.lifetime.end(i64 32, i8* nonnull %22) #7 - %644 = load i8*, i8** %_M_p.i.i1184, align 8, !tbaa !107 - %arraydecay.i.i.i.i262 = bitcast %union.anon* %14 to i8* - %cmp.i.i.i263 = icmp eq i8* %644, %arraydecay.i.i.i.i262 - br i1 %cmp.i.i.i263, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit265, label %if.then.i.i264 - -if.then.i.i264: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit294 - call void @_ZdlPv(i8* %644) #7 - br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit265 - -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit265: ; preds = %if.then.i.i264, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit294 - call void @llvm.lifetime.end(i64 32, i8* nonnull %6) #7 - %645 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !107 - %cmp.i.i.i = icmp eq i8* %645, %3 + %615 = load i8*, i8** %_M_p.i.i1231, align 8, !tbaa !113 + %arraydecay.i.i.i.i1039 = bitcast %union.anon* %539 to i8* + %cmp.i.i.i1040 = icmp eq i8* %615, %arraydecay.i.i.i.i1039 + br i1 %cmp.i.i.i1040, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1042, label %if.then.i.i1041 + +if.then.i.i1041: ; preds = %_Z11readLabels3PKci.exit + call void @_ZdlPv(i8* %615) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1042 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1042: ; preds = %if.then.i.i1041, %_Z11readLabels3PKci.exit + call void @llvm.lifetime.end(i64 32, i8* nonnull %531) #2 + %616 = load i8*, i8** %_M_p.i.i1379, align 8, !tbaa !113 + %arraydecay.i.i.i.i997 = bitcast %union.anon* %522 to i8* + %cmp.i.i.i998 = icmp eq i8* %616, %arraydecay.i.i.i.i997 + br i1 %cmp.i.i.i998, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1000, label %if.then.i.i999 + +if.then.i.i999: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1042 + call void @_ZdlPv(i8* %616) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1000 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1000: ; preds = %if.then.i.i999, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1042 + call void @llvm.lifetime.end(i64 32, i8* nonnull %514) #2 + %617 = load i8*, i8** %_M_p.i.i1490, align 8, !tbaa !113 + %arraydecay.i.i.i.i992 = bitcast %union.anon* %505 to i8* + %cmp.i.i.i993 = icmp eq i8* %617, %arraydecay.i.i.i.i992 + br i1 %cmp.i.i.i993, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit995, label %if.then.i.i994 + +if.then.i.i994: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1000 + call void @_ZdlPv(i8* %617) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit995 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit995: ; preds = %if.then.i.i994, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit1000 + call void @llvm.lifetime.end(i64 32, i8* nonnull %497) #2 + %618 = load i8*, i8** %_M_p.i.i1600, align 8, !tbaa !113 + %arraydecay.i.i.i.i950 = bitcast %union.anon* %488 to i8* + %cmp.i.i.i951 = icmp eq i8* %618, %arraydecay.i.i.i.i950 + br i1 %cmp.i.i.i951, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit953, label %if.then.i.i952 + +if.then.i.i952: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit995 + call void @_ZdlPv(i8* %618) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit953 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit953: ; preds = %if.then.i.i952, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit995 + call void @llvm.lifetime.end(i64 32, i8* nonnull %480) #2 + %619 = load i8*, i8** %_M_p.i.i1532, align 8, !tbaa !113 + %arraydecay.i.i.i.i945 = bitcast %union.anon* %471 to i8* + %cmp.i.i.i946 = icmp eq i8* %619, %arraydecay.i.i.i.i945 + br i1 %cmp.i.i.i946, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit948, label %if.then.i.i947 + +if.then.i.i947: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit953 + call void @_ZdlPv(i8* %619) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit948 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit948: ; preds = %if.then.i.i947, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit953 + call void @llvm.lifetime.end(i64 32, i8* nonnull %463) #2 + %620 = load i8*, i8** %_M_p.i.i1489, align 8, !tbaa !113 + %arraydecay.i.i.i.i903 = bitcast %union.anon* %454 to i8* + %cmp.i.i.i904 = icmp eq i8* %620, %arraydecay.i.i.i.i903 + br i1 %cmp.i.i.i904, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit906, label %if.then.i.i905 + +if.then.i.i905: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit948 + call void @_ZdlPv(i8* %620) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit906 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit906: ; preds = %if.then.i.i905, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit948 + call void @llvm.lifetime.end(i64 32, i8* nonnull %446) #2 + %621 = load i8*, i8** %_M_p.i.i1436, align 8, !tbaa !113 + %arraydecay.i.i.i.i898 = bitcast %union.anon* %437 to i8* + %cmp.i.i.i899 = icmp eq i8* %621, %arraydecay.i.i.i.i898 + br i1 %cmp.i.i.i899, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit901, label %if.then.i.i900 + +if.then.i.i900: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit906 + call void @_ZdlPv(i8* %621) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit901 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit901: ; preds = %if.then.i.i900, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit906 + call void @llvm.lifetime.end(i64 32, i8* nonnull %429) #2 + %622 = load i8*, i8** %_M_p.i.i1378, align 8, !tbaa !113 + %arraydecay.i.i.i.i856 = bitcast %union.anon* %420 to i8* + %cmp.i.i.i857 = icmp eq i8* %622, %arraydecay.i.i.i.i856 + br i1 %cmp.i.i.i857, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit859, label %if.then.i.i858 + +if.then.i.i858: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit901 + call void @_ZdlPv(i8* %622) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit859 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit859: ; preds = %if.then.i.i858, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit901 + call void @llvm.lifetime.end(i64 32, i8* nonnull %412) #2 + %623 = load i8*, i8** %_M_p.i.i1341, align 8, !tbaa !113 + %arraydecay.i.i.i.i851 = bitcast %union.anon* %403 to i8* + %cmp.i.i.i852 = icmp eq i8* %623, %arraydecay.i.i.i.i851 + br i1 %cmp.i.i.i852, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit854, label %if.then.i.i853 + +if.then.i.i853: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit859 + call void @_ZdlPv(i8* %623) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit854 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit854: ; preds = %if.then.i.i853, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit859 + call void @llvm.lifetime.end(i64 32, i8* nonnull %395) #2 + %624 = load i8*, i8** %_M_p.i.i1273, align 8, !tbaa !113 + %arraydecay.i.i.i.i809 = bitcast %union.anon* %386 to i8* + %cmp.i.i.i810 = icmp eq i8* %624, %arraydecay.i.i.i.i809 + br i1 %cmp.i.i.i810, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit812, label %if.then.i.i811 + +if.then.i.i811: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit854 + call void @_ZdlPv(i8* %624) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit812 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit812: ; preds = %if.then.i.i811, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit854 + call void @llvm.lifetime.end(i64 32, i8* nonnull %378) #2 + %625 = load i8*, i8** %_M_p.i.i1229, align 8, !tbaa !113 + %arraydecay.i.i.i.i804 = bitcast %union.anon* %369 to i8* + %cmp.i.i.i805 = icmp eq i8* %625, %arraydecay.i.i.i.i804 + br i1 %cmp.i.i.i805, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit807, label %if.then.i.i806 + +if.then.i.i806: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit812 + call void @_ZdlPv(i8* %625) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit807 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit807: ; preds = %if.then.i.i806, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit812 + call void @llvm.lifetime.end(i64 32, i8* nonnull %361) #2 + %626 = load i8*, i8** %_M_p.i.i1191, align 8, !tbaa !113 + %arraydecay.i.i.i.i762 = bitcast %union.anon* %352 to i8* + %cmp.i.i.i763 = icmp eq i8* %626, %arraydecay.i.i.i.i762 + br i1 %cmp.i.i.i763, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit765, label %if.then.i.i764 + +if.then.i.i764: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit807 + call void @_ZdlPv(i8* %626) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit765 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit765: ; preds = %if.then.i.i764, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit807 + call void @llvm.lifetime.end(i64 32, i8* nonnull %344) #2 + %627 = load i8*, i8** %_M_p.i.i1154, align 8, !tbaa !113 + %arraydecay.i.i.i.i757 = bitcast %union.anon* %335 to i8* + %cmp.i.i.i758 = icmp eq i8* %627, %arraydecay.i.i.i.i757 + br i1 %cmp.i.i.i758, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit760, label %if.then.i.i759 + +if.then.i.i759: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit765 + call void @_ZdlPv(i8* %627) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit760 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit760: ; preds = %if.then.i.i759, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit765 + call void @llvm.lifetime.end(i64 32, i8* nonnull %327) #2 + %628 = load i8*, i8** %_M_p.i.i1117, align 8, !tbaa !113 + %arraydecay.i.i.i.i715 = bitcast %union.anon* %318 to i8* + %cmp.i.i.i716 = icmp eq i8* %628, %arraydecay.i.i.i.i715 + br i1 %cmp.i.i.i716, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit718, label %if.then.i.i717 + +if.then.i.i717: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit760 + call void @_ZdlPv(i8* %628) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit718 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit718: ; preds = %if.then.i.i717, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit760 + call void @llvm.lifetime.end(i64 32, i8* nonnull %310) #2 + %629 = load i8*, i8** %_M_p.i.i1079, align 8, !tbaa !113 + %arraydecay.i.i.i.i710 = bitcast %union.anon* %301 to i8* + %cmp.i.i.i711 = icmp eq i8* %629, %arraydecay.i.i.i.i710 + br i1 %cmp.i.i.i711, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit713, label %if.then.i.i712 + +if.then.i.i712: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit718 + call void @_ZdlPv(i8* %629) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit713 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit713: ; preds = %if.then.i.i712, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit718 + call void @llvm.lifetime.end(i64 32, i8* nonnull %293) #2 + %630 = load i8*, i8** %_M_p.i.i1037, align 8, !tbaa !113 + %arraydecay.i.i.i.i668 = bitcast %union.anon* %284 to i8* + %cmp.i.i.i669 = icmp eq i8* %630, %arraydecay.i.i.i.i668 + br i1 %cmp.i.i.i669, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit671, label %if.then.i.i670 + +if.then.i.i670: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit713 + call void @_ZdlPv(i8* %630) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit671 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit671: ; preds = %if.then.i.i670, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit713 + call void @llvm.lifetime.end(i64 32, i8* nonnull %276) #2 + %631 = load i8*, i8** %_M_p.i.i990, align 8, !tbaa !113 + %arraydecay.i.i.i.i663 = bitcast %union.anon* %267 to i8* + %cmp.i.i.i664 = icmp eq i8* %631, %arraydecay.i.i.i.i663 + br i1 %cmp.i.i.i664, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit666, label %if.then.i.i665 + +if.then.i.i665: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit671 + call void @_ZdlPv(i8* %631) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit666 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit666: ; preds = %if.then.i.i665, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit671 + call void @llvm.lifetime.end(i64 32, i8* nonnull %259) #2 + %632 = load i8*, i8** %_M_p.i.i943, align 8, !tbaa !113 + %arraydecay.i.i.i.i621 = bitcast %union.anon* %250 to i8* + %cmp.i.i.i622 = icmp eq i8* %632, %arraydecay.i.i.i.i621 + br i1 %cmp.i.i.i622, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit624, label %if.then.i.i623 + +if.then.i.i623: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit666 + call void @_ZdlPv(i8* %632) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit624 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit624: ; preds = %if.then.i.i623, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit666 + call void @llvm.lifetime.end(i64 32, i8* nonnull %242) #2 + %633 = load i8*, i8** %_M_p.i.i896, align 8, !tbaa !113 + %arraydecay.i.i.i.i616 = bitcast %union.anon* %233 to i8* + %cmp.i.i.i617 = icmp eq i8* %633, %arraydecay.i.i.i.i616 + br i1 %cmp.i.i.i617, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit619, label %if.then.i.i618 + +if.then.i.i618: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit624 + call void @_ZdlPv(i8* %633) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit619 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit619: ; preds = %if.then.i.i618, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit624 + call void @llvm.lifetime.end(i64 32, i8* nonnull %225) #2 + %634 = load i8*, i8** %_M_p.i.i849, align 8, !tbaa !113 + %arraydecay.i.i.i.i574 = bitcast %union.anon* %216 to i8* + %cmp.i.i.i575 = icmp eq i8* %634, %arraydecay.i.i.i.i574 + br i1 %cmp.i.i.i575, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit577, label %if.then.i.i576 + +if.then.i.i576: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit619 + call void @_ZdlPv(i8* %634) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit577 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit577: ; preds = %if.then.i.i576, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit619 + call void @llvm.lifetime.end(i64 32, i8* nonnull %208) #2 + %635 = load i8*, i8** %_M_p.i.i802, align 8, !tbaa !113 + %arraydecay.i.i.i.i569 = bitcast %union.anon* %199 to i8* + %cmp.i.i.i570 = icmp eq i8* %635, %arraydecay.i.i.i.i569 + br i1 %cmp.i.i.i570, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit572, label %if.then.i.i571 + +if.then.i.i571: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit577 + call void @_ZdlPv(i8* %635) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit572 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit572: ; preds = %if.then.i.i571, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit577 + call void @llvm.lifetime.end(i64 32, i8* nonnull %191) #2 + %636 = load i8*, i8** %_M_p.i.i755, align 8, !tbaa !113 + %arraydecay.i.i.i.i527 = bitcast %union.anon* %182 to i8* + %cmp.i.i.i528 = icmp eq i8* %636, %arraydecay.i.i.i.i527 + br i1 %cmp.i.i.i528, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit530, label %if.then.i.i529 + +if.then.i.i529: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit572 + call void @_ZdlPv(i8* %636) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit530 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit530: ; preds = %if.then.i.i529, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit572 + call void @llvm.lifetime.end(i64 32, i8* nonnull %174) #2 + %637 = load i8*, i8** %_M_p.i.i708, align 8, !tbaa !113 + %arraydecay.i.i.i.i522 = bitcast %union.anon* %165 to i8* + %cmp.i.i.i523 = icmp eq i8* %637, %arraydecay.i.i.i.i522 + br i1 %cmp.i.i.i523, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit525, label %if.then.i.i524 + +if.then.i.i524: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit530 + call void @_ZdlPv(i8* %637) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit525 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit525: ; preds = %if.then.i.i524, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit530 + call void @llvm.lifetime.end(i64 32, i8* nonnull %157) #2 + %638 = load i8*, i8** %_M_p.i.i661, align 8, !tbaa !113 + %arraydecay.i.i.i.i480 = bitcast %union.anon* %148 to i8* + %cmp.i.i.i481 = icmp eq i8* %638, %arraydecay.i.i.i.i480 + br i1 %cmp.i.i.i481, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit483, label %if.then.i.i482 + +if.then.i.i482: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit525 + call void @_ZdlPv(i8* %638) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit483 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit483: ; preds = %if.then.i.i482, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit525 + call void @llvm.lifetime.end(i64 32, i8* nonnull %140) #2 + %639 = load i8*, i8** %_M_p.i.i614, align 8, !tbaa !113 + %arraydecay.i.i.i.i475 = bitcast %union.anon* %131 to i8* + %cmp.i.i.i476 = icmp eq i8* %639, %arraydecay.i.i.i.i475 + br i1 %cmp.i.i.i476, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit478, label %if.then.i.i477 + +if.then.i.i477: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit483 + call void @_ZdlPv(i8* %639) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit478 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit478: ; preds = %if.then.i.i477, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit483 + call void @llvm.lifetime.end(i64 32, i8* nonnull %123) #2 + %640 = load i8*, i8** %_M_p.i.i567, align 8, !tbaa !113 + %arraydecay.i.i.i.i433 = bitcast %union.anon* %114 to i8* + %cmp.i.i.i434 = icmp eq i8* %640, %arraydecay.i.i.i.i433 + br i1 %cmp.i.i.i434, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit436, label %if.then.i.i435 + +if.then.i.i435: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit478 + call void @_ZdlPv(i8* %640) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit436 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit436: ; preds = %if.then.i.i435, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit478 + call void @llvm.lifetime.end(i64 32, i8* nonnull %106) #2 + %641 = load i8*, i8** %_M_p.i.i520, align 8, !tbaa !113 + %arraydecay.i.i.i.i428 = bitcast %union.anon* %97 to i8* + %cmp.i.i.i429 = icmp eq i8* %641, %arraydecay.i.i.i.i428 + br i1 %cmp.i.i.i429, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit431, label %if.then.i.i430 + +if.then.i.i430: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit436 + call void @_ZdlPv(i8* %641) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit431 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit431: ; preds = %if.then.i.i430, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit436 + call void @llvm.lifetime.end(i64 32, i8* nonnull %89) #2 + %642 = load i8*, i8** %_M_p.i.i473, align 8, !tbaa !113 + %arraydecay.i.i.i.i386 = bitcast %union.anon* %80 to i8* + %cmp.i.i.i387 = icmp eq i8* %642, %arraydecay.i.i.i.i386 + br i1 %cmp.i.i.i387, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit389, label %if.then.i.i388 + +if.then.i.i388: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit431 + call void @_ZdlPv(i8* %642) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit389 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit389: ; preds = %if.then.i.i388, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit431 + call void @llvm.lifetime.end(i64 32, i8* nonnull %72) #2 + %643 = load i8*, i8** %_M_p.i.i426, align 8, !tbaa !113 + %arraydecay.i.i.i.i381 = bitcast %union.anon* %63 to i8* + %cmp.i.i.i382 = icmp eq i8* %643, %arraydecay.i.i.i.i381 + br i1 %cmp.i.i.i382, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit384, label %if.then.i.i383 + +if.then.i.i383: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit389 + call void @_ZdlPv(i8* %643) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit384 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit384: ; preds = %if.then.i.i383, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit389 + call void @llvm.lifetime.end(i64 32, i8* nonnull %55) #2 + %644 = load i8*, i8** %_M_p.i.i, align 8, !tbaa !113 + %arraydecay.i.i.i.i340 = bitcast %union.anon* %46 to i8* + %cmp.i.i.i341 = icmp eq i8* %644, %arraydecay.i.i.i.i340 + br i1 %cmp.i.i.i341, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit343, label %if.then.i.i342 + +if.then.i.i342: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit384 + call void @_ZdlPv(i8* %644) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit343 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit343: ; preds = %if.then.i.i342, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit384 + call void @llvm.lifetime.end(i64 32, i8* nonnull %38) #2 + %645 = load i8*, i8** %_M_p.i.i1192, align 8, !tbaa !113 + %arraydecay.i.i.i.i299 = bitcast %union.anon* %30 to i8* + %cmp.i.i.i300 = icmp eq i8* %645, %arraydecay.i.i.i.i299 + br i1 %cmp.i.i.i300, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit302, label %if.then.i.i301 + +if.then.i.i301: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit343 + call void @_ZdlPv(i8* %645) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit302 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit302: ; preds = %if.then.i.i301, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit343 + call void @llvm.lifetime.end(i64 32, i8* nonnull %22) #2 + %646 = load i8*, i8** %_M_p.i.i1230, align 8, !tbaa !113 + %arraydecay.i.i.i.i270 = bitcast %union.anon* %14 to i8* + %cmp.i.i.i271 = icmp eq i8* %646, %arraydecay.i.i.i.i270 + br i1 %cmp.i.i.i271, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit273, label %if.then.i.i272 + +if.then.i.i272: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit302 + call void @_ZdlPv(i8* %646) #2 + br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit273 + +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit273: ; preds = %if.then.i.i272, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit302 + call void @llvm.lifetime.end(i64 32, i8* nonnull %6) #2 + %647 = load i8*, i8** %_M_p.i13.i.i.i.i, align 8, !tbaa !113 + %cmp.i.i.i = icmp eq i8* %647, %3 br i1 %cmp.i.i.i, label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit, label %if.then.i.i -if.then.i.i: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit265 - call void @_ZdlPv(i8* %645) #7 +if.then.i.i: ; preds = %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit273 + call void @_ZdlPv(i8* %647) #2 br label %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit -_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit: ; preds = %if.then.i.i, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit265 - call void @llvm.lifetime.end(i64 32, i8* nonnull %0) #7 +_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit: ; preds = %if.then.i.i, %_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEED2Ev.exit273 + call void @llvm.lifetime.end(i64 32, i8* nonnull %0) #2 ret i32 0 } +declare void @startMemTracking() local_unnamed_addr #0 + +declare void @startProfiling() local_unnamed_addr #0 + +declare void @freeBatchMemory() local_unnamed_addr #0 + +declare void @stopProfiling() local_unnamed_addr #0 + +; Function Attrs: nounwind readnone +declare float @log10f(float) local_unnamed_addr #8 + +; Function Attrs: nounwind readnone +declare float @sqrtf(float) local_unnamed_addr #8 + ; Function Attrs: nobuiltin nounwind -declare void @_ZdlPv(i8*) local_unnamed_addr #6 +declare void @_ZdlPv(i8*) local_unnamed_addr #9 -declare void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"*, %"class.std::basic_streambuf"*) local_unnamed_addr #3 +declare void @_ZNSt9basic_iosIcSt11char_traitsIcEE4initEPSt15basic_streambufIcS1_E(%"class.std::basic_ios"*, %"class.std::basic_streambuf"*) local_unnamed_addr #0 ; Function Attrs: nounwind -declare void @_ZNSt8ios_baseC2Ev(%"class.std::ios_base"*) unnamed_addr #2 +declare void @_ZNSt8ios_baseC2Ev(%"class.std::ios_base"*) unnamed_addr #1 ; Function Attrs: nounwind -declare void @_ZNSt6localeC1Ev(%"class.std::locale"*) unnamed_addr #2 +declare void @_ZNSt6localeC1Ev(%"class.std::locale"*) unnamed_addr #1 ; Function Attrs: nounwind -declare void @_ZNSt6localeD1Ev(%"class.std::locale"*) unnamed_addr #2 +declare void @_ZNSt6localeD1Ev(%"class.std::locale"*) unnamed_addr #1 ; Function Attrs: nounwind -declare void @_ZNSt8ios_baseD2Ev(%"class.std::ios_base"*) unnamed_addr #2 +declare void @_ZNSt8ios_baseD2Ev(%"class.std::ios_base"*) unnamed_addr #1 -declare dereferenceable(272) %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"*, double) local_unnamed_addr #3 +declare dereferenceable(272) %"class.std::basic_ostream"* @_ZNSo9_M_insertIdEERSoT_(%"class.std::basic_ostream"*, double) local_unnamed_addr #0 ; Function Attrs: nounwind uwtable -declare void @_ZNKSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEE3strEv(%"class.std::__cxx11::basic_string"* noalias sret, %"class.std::__cxx11::basic_stringbuf"*) local_unnamed_addr #0 align 2 +declare void @_ZNKSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEE3strEv(%"class.std::__cxx11::basic_string"* noalias sret, %"class.std::__cxx11::basic_stringbuf"*) local_unnamed_addr #3 align 2 + +declare i8* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_createERmm(%"class.std::__cxx11::basic_string"*, i64* dereferenceable(8), i64) local_unnamed_addr #0 -declare i8* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_createERmm(%"class.std::__cxx11::basic_string"*, i64* dereferenceable(8), i64) local_unnamed_addr #3 +; Function Attrs: noreturn +declare void @_ZSt17__throw_bad_allocv() local_unnamed_addr #10 + +; Function Attrs: nobuiltin +declare noalias nonnull i8* @_Znwm(i64) local_unnamed_addr #11 ; Function Attrs: argmemonly nounwind -declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #1 +declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #4 -declare dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"*, i64, i64, i8*, i64) local_unnamed_addr #3 +; Function Attrs: nounwind uwtable +define linkonce_odr void @_ZSt16__introsort_loopIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElNS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_T0_T1_(%struct.ClassProb* %__first.coerce, %struct.ClassProb* %__last.coerce, i64 %__depth_limit, i1 (i64, i64)* %__comp.coerce) local_unnamed_addr #3 comdat { +entry: + %0 = ptrtoint %struct.ClassProb* %__first.coerce to i64 + %1 = ptrtoint %struct.ClassProb* %__last.coerce to i64 + %sub.ptr.sub.i33 = sub i64 %1, %0 + %cmp35 = icmp sgt i64 %sub.ptr.sub.i33, 128 + br i1 %cmp35, label %while.body.lr.ph, label %while.end + +while.body.lr.ph: ; preds = %entry + %add.ptr.i33.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 1 + %agg.tmp.sroa.0.0..sroa_cast.i.i36.i = bitcast %struct.ClassProb* %add.ptr.i33.i to i64* + %.sink95.i.i = bitcast %struct.ClassProb* %__first.coerce to i64* + br label %while.body + +while.body: ; preds = %_ZSt27__unguarded_partition_pivotIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEET_SD_SD_T0_.exit, %while.body.lr.ph + %sub.ptr.div.i39.in = phi i64 [ %sub.ptr.sub.i33, %while.body.lr.ph ], [ %sub.ptr.sub.i, %_ZSt27__unguarded_partition_pivotIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEET_SD_SD_T0_.exit ] + %__depth_limit.addr.037 = phi i64 [ %__depth_limit, %while.body.lr.ph ], [ %dec, %_ZSt27__unguarded_partition_pivotIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEET_SD_SD_T0_.exit ] + %__first.sroa.0.1.i.i.sink36 = phi %struct.ClassProb* [ %__last.coerce, %while.body.lr.ph ], [ %__first.sroa.0.1.i.i, %_ZSt27__unguarded_partition_pivotIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEET_SD_SD_T0_.exit ] + %cmp3 = icmp eq i64 %__depth_limit.addr.037, 0 + br i1 %cmp3, label %while.body.i.preheader.i, label %if.end + +while.body.i.preheader.i: ; preds = %while.body + tail call void @_ZSt11__make_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_T0_(%struct.ClassProb* %__first.coerce, %struct.ClassProb* %__first.sroa.0.1.i.i.sink36, i1 (i64, i64)* %__comp.coerce) #2 + br label %while.body.i.i + +while.body.i.i: ; preds = %_ZSt10__pop_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_SD_T0_.exit.i, %while.body.i.preheader.i + %__last.sroa.0.0.in14.i.i = phi %struct.ClassProb* [ %incdec.ptr.i.i17.i, %_ZSt10__pop_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_SD_T0_.exit.i ], [ %__first.sroa.0.1.i.i.sink36, %while.body.i.preheader.i ] + %incdec.ptr.i.i17.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__last.sroa.0.0.in14.i.i, i64 -1 + %2 = ptrtoint %struct.ClassProb* %incdec.ptr.i.i17.i to i64 + %3 = bitcast %struct.ClassProb* %incdec.ptr.i.i17.i to i64* + %4 = load i64, i64* %3, align 4 + %5 = load i64, i64* %.sink95.i.i, align 4 + store i64 %5, i64* %3, align 4 + %sub.ptr.sub.i.i18.i = sub i64 %2, %0 + %sub.ptr.div.i.i.i = ashr exact i64 %sub.ptr.sub.i.i18.i, 3 + %sub.i.i.i = add nsw i64 %sub.ptr.div.i.i.i, -1 + %div.i.i.i = sdiv i64 %sub.i.i.i, 2 + %cmp84.i.i.i = icmp sgt i64 %sub.i.i.i, 1 + br i1 %cmp84.i.i.i, label %while.body.i.i.i.preheader, label %while.end.i.i.i + +while.body.i.i.i.preheader: ; preds = %while.body.i.i + br label %while.body.i.i.i + +while.body.i.i.i: ; preds = %while.body.i.i.i, %while.body.i.i.i.preheader + %__secondChild.085.i.i.i = phi i64 [ %dec.mul.i.i.i, %while.body.i.i.i ], [ 0, %while.body.i.i.i.preheader ] + %add.i.i.i = shl i64 %__secondChild.085.i.i.i, 1 + %mul.i.i.i = add i64 %add.i.i.i, 2 + %add.ptr.i.i.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %mul.i.i.i + %sub4.i.i.i = or i64 %add.i.i.i, 1 + %add.ptr.i66.i.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %sub4.i.i.i + %agg.tmp.sroa.0.0..sroa_cast.i.i.i.i = bitcast %struct.ClassProb* %add.ptr.i.i.i.i to i64* + %agg.tmp.sroa.0.0.copyload.i.i.i.i = load i64, i64* %agg.tmp.sroa.0.0..sroa_cast.i.i.i.i, align 4 + %agg.tmp3.sroa.0.0..sroa_cast.i.i.i.i = bitcast %struct.ClassProb* %add.ptr.i66.i.i.i to i64* + %agg.tmp3.sroa.0.0.copyload.i.i.i.i = load i64, i64* %agg.tmp3.sroa.0.0..sroa_cast.i.i.i.i, align 4 + %call5.i.i.i.i = tail call zeroext i1 %__comp.coerce(i64 %agg.tmp.sroa.0.0.copyload.i.i.i.i, i64 %agg.tmp3.sroa.0.0.copyload.i.i.i.i) #2 + %dec.mul.i.i.i = select i1 %call5.i.i.i.i, i64 %sub4.i.i.i, i64 %mul.i.i.i + %add.ptr.i78.i.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %dec.mul.i.i.i + %add.ptr.i75.i.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %__secondChild.085.i.i.i + %6 = bitcast %struct.ClassProb* %add.ptr.i78.i.i.i to i64* + %7 = bitcast %struct.ClassProb* %add.ptr.i75.i.i.i to i64* + %8 = load i64, i64* %6, align 4 + store i64 %8, i64* %7, align 4 + %cmp.i.i19.i = icmp slt i64 %dec.mul.i.i.i, %div.i.i.i + br i1 %cmp.i.i19.i, label %while.body.i.i.i, label %while.end.i.i.i.loopexit + +while.end.i.i.i.loopexit: ; preds = %while.body.i.i.i + br label %while.end.i.i.i + +while.end.i.i.i: ; preds = %while.end.i.i.i.loopexit, %while.body.i.i + %__secondChild.0.lcssa.i.i.i = phi i64 [ 0, %while.body.i.i ], [ %dec.mul.i.i.i, %while.end.i.i.i.loopexit ] + %and.i.i.i = and i64 %sub.ptr.div.i.i.i, 1 + %cmp18.i.i.i = icmp eq i64 %and.i.i.i, 0 + br i1 %cmp18.i.i.i, label %land.lhs.true.i.i.i, label %if.end36.i.i.i + +land.lhs.true.i.i.i: ; preds = %while.end.i.i.i + %sub19.i.i.i = add nsw i64 %sub.ptr.div.i.i.i, -2 + %div20.i.i.i = sdiv i64 %sub19.i.i.i, 2 + %cmp21.i.i.i = icmp eq i64 %__secondChild.0.lcssa.i.i.i, %div20.i.i.i + br i1 %cmp21.i.i.i, label %if.then22.i.i.i, label %if.end36.i.i.i + +if.then22.i.i.i: ; preds = %land.lhs.true.i.i.i + %add23.i.i.i = shl i64 %__secondChild.0.lcssa.i.i.i, 1 + %sub25.i.i.i = or i64 %add23.i.i.i, 1 + %add.ptr.i72.i.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %sub25.i.i.i + %add.ptr.i69.i.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %__secondChild.0.lcssa.i.i.i + %9 = bitcast %struct.ClassProb* %add.ptr.i72.i.i.i to i64* + %10 = bitcast %struct.ClassProb* %add.ptr.i69.i.i.i to i64* + %11 = load i64, i64* %9, align 4 + store i64 %11, i64* %10, align 4 + br label %if.end36.i.i.i + +if.end36.i.i.i: ; preds = %if.then22.i.i.i, %land.lhs.true.i.i.i, %while.end.i.i.i + %__holeIndex.addr.1.i.i.i = phi i64 [ %sub25.i.i.i, %if.then22.i.i.i ], [ %__secondChild.0.lcssa.i.i.i, %land.lhs.true.i.i.i ], [ %__secondChild.0.lcssa.i.i.i, %while.end.i.i.i ] + %cmp42.i.i.i.i = icmp sgt i64 %__holeIndex.addr.1.i.i.i, 0 + br i1 %cmp42.i.i.i.i, label %land.rhs.i.i.i.i.preheader, label %_ZSt10__pop_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_SD_T0_.exit.i + +land.rhs.i.i.i.i.preheader: ; preds = %if.end36.i.i.i + br label %land.rhs.i.i.i.i + +land.rhs.i.i.i.i: ; preds = %while.body.i.i.i.i, %land.rhs.i.i.i.i.preheader + %__parent.044.in.in.i.i.i.i = phi i64 [ %__parent.044.i.i.i.i, %while.body.i.i.i.i ], [ %__holeIndex.addr.1.i.i.i, %land.rhs.i.i.i.i.preheader ] + %__parent.044.in.i.i.i.i = add nsw i64 %__parent.044.in.in.i.i.i.i, -1 + %__parent.044.i.i.i.i = sdiv i64 %__parent.044.in.i.i.i.i, 2 + %add.ptr.i.i.i.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %__parent.044.i.i.i.i + %agg.tmp.sroa.0.0..sroa_cast.i.i.i.i.i = bitcast %struct.ClassProb* %add.ptr.i.i.i.i.i to i64* + %agg.tmp.sroa.0.0.copyload.i.i.i.i.i = load i64, i64* %agg.tmp.sroa.0.0..sroa_cast.i.i.i.i.i, align 4 + %call3.i.i.i.i.i = tail call zeroext i1 %__comp.coerce(i64 %agg.tmp.sroa.0.0.copyload.i.i.i.i.i, i64 %4) #2 + br i1 %call3.i.i.i.i.i, label %while.body.i.i.i.i, label %_ZSt10__pop_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_SD_T0_.exit.i.loopexit + +while.body.i.i.i.i: ; preds = %land.rhs.i.i.i.i + %add.ptr.i32.i.i.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %__parent.044.in.in.i.i.i.i + %12 = bitcast %struct.ClassProb* %add.ptr.i32.i.i.i.i to i64* + %13 = load i64, i64* %agg.tmp.sroa.0.0..sroa_cast.i.i.i.i.i, align 4 + store i64 %13, i64* %12, align 4 + %cmp.i.i.i.i = icmp sgt i64 %__parent.044.in.i.i.i.i, 1 + br i1 %cmp.i.i.i.i, label %land.rhs.i.i.i.i, label %_ZSt10__pop_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_SD_T0_.exit.i.loopexit + +_ZSt10__pop_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_SD_T0_.exit.i.loopexit: ; preds = %while.body.i.i.i.i, %land.rhs.i.i.i.i + %__parent.0.in.in.lcssa.i.i.i.i.ph = phi i64 [ %__parent.044.i.i.i.i, %while.body.i.i.i.i ], [ %__parent.044.in.in.i.i.i.i, %land.rhs.i.i.i.i ] + br label %_ZSt10__pop_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_SD_T0_.exit.i + +_ZSt10__pop_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_SD_T0_.exit.i: ; preds = %_ZSt10__pop_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_SD_T0_.exit.i.loopexit, %if.end36.i.i.i + %__parent.0.in.in.lcssa.i.i.i.i = phi i64 [ %__holeIndex.addr.1.i.i.i, %if.end36.i.i.i ], [ %__parent.0.in.in.lcssa.i.i.i.i.ph, %_ZSt10__pop_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_SD_T0_.exit.i.loopexit ] + %add.ptr.i29.i.i.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %__parent.0.in.in.lcssa.i.i.i.i + %14 = bitcast %struct.ClassProb* %add.ptr.i29.i.i.i.i to i64* + store i64 %4, i64* %14, align 4 + %cmp.i.i = icmp sgt i64 %sub.ptr.sub.i.i18.i, 8 + br i1 %cmp.i.i, label %while.body.i.i, label %while.end.loopexit + +if.end: ; preds = %while.body + %sub.ptr.div.i3943 = lshr i64 %sub.ptr.div.i39.in, 4 + %dec = add nsw i64 %__depth_limit.addr.037, -1 + %add.ptr.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %sub.ptr.div.i3943 + %add.ptr.i42.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.sroa.0.1.i.i.sink36, i64 -1 + %agg.tmp.sroa.0.0.copyload.i.i37.i = load i64, i64* %agg.tmp.sroa.0.0..sroa_cast.i.i36.i, align 4 + %agg.tmp3.sroa.0.0..sroa_cast.i.i38.i = bitcast %struct.ClassProb* %add.ptr.i.i to i64* + %agg.tmp3.sroa.0.0.copyload.i.i39.i = load i64, i64* %agg.tmp3.sroa.0.0..sroa_cast.i.i38.i, align 4 + %call5.i.i40.i = tail call zeroext i1 %__comp.coerce(i64 %agg.tmp.sroa.0.0.copyload.i.i37.i, i64 %agg.tmp3.sroa.0.0.copyload.i.i39.i) #2 + br i1 %call5.i.i40.i, label %if.then.i.i, label %if.else34.i.i + +if.then.i.i: ; preds = %if.end + %agg.tmp.sroa.0.0.copyload.i66.i.i = load i64, i64* %agg.tmp3.sroa.0.0..sroa_cast.i.i38.i, align 4 + %agg.tmp3.sroa.0.0..sroa_cast.i67.i.i = bitcast %struct.ClassProb* %add.ptr.i42.i to i64* + %agg.tmp3.sroa.0.0.copyload.i68.i.i = load i64, i64* %agg.tmp3.sroa.0.0..sroa_cast.i67.i.i, align 4 + %call5.i69.i.i = tail call zeroext i1 %__comp.coerce(i64 %agg.tmp.sroa.0.0.copyload.i66.i.i, i64 %agg.tmp3.sroa.0.0.copyload.i68.i.i) #2 + br i1 %call5.i69.i.i, label %while.body.i.i28.preheader, label %if.else.i.i + +if.else.i.i: ; preds = %if.then.i.i + %agg.tmp.sroa.0.0.copyload.i78.i.i = load i64, i64* %agg.tmp.sroa.0.0..sroa_cast.i.i36.i, align 4 + %agg.tmp3.sroa.0.0.copyload.i80.i.i = load i64, i64* %agg.tmp3.sroa.0.0..sroa_cast.i67.i.i, align 4 + %call5.i81.i.i = tail call zeroext i1 %__comp.coerce(i64 %agg.tmp.sroa.0.0.copyload.i78.i.i, i64 %agg.tmp3.sroa.0.0.copyload.i80.i.i) #2 + %agg.tmp3.sroa.0.0..sroa_cast.i67.agg.tmp.sroa.0.0..sroa_cast.i.i.i = select i1 %call5.i81.i.i, i64* %agg.tmp3.sroa.0.0..sroa_cast.i67.i.i, i64* %agg.tmp.sroa.0.0..sroa_cast.i.i36.i + br label %while.body.i.i28.preheader + +if.else34.i.i: ; preds = %if.end + %agg.tmp.sroa.0.0.copyload.i84.i.i = load i64, i64* %agg.tmp.sroa.0.0..sroa_cast.i.i36.i, align 4 + %agg.tmp3.sroa.0.0..sroa_cast.i85.i.i = bitcast %struct.ClassProb* %add.ptr.i42.i to i64* + %agg.tmp3.sroa.0.0.copyload.i86.i.i = load i64, i64* %agg.tmp3.sroa.0.0..sroa_cast.i85.i.i, align 4 + %call5.i87.i.i = tail call zeroext i1 %__comp.coerce(i64 %agg.tmp.sroa.0.0.copyload.i84.i.i, i64 %agg.tmp3.sroa.0.0.copyload.i86.i.i) #2 + br i1 %call5.i87.i.i, label %while.body.i.i28.preheader, label %if.else45.i.i + +if.else45.i.i: ; preds = %if.else34.i.i + %agg.tmp.sroa.0.0.copyload.i72.i.i = load i64, i64* %agg.tmp3.sroa.0.0..sroa_cast.i.i38.i, align 4 + %agg.tmp3.sroa.0.0.copyload.i74.i.i = load i64, i64* %agg.tmp3.sroa.0.0..sroa_cast.i85.i.i, align 4 + %call5.i75.i.i = tail call zeroext i1 %__comp.coerce(i64 %agg.tmp.sroa.0.0.copyload.i72.i.i, i64 %agg.tmp3.sroa.0.0.copyload.i74.i.i) #2 + %agg.tmp3.sroa.0.0..sroa_cast.i85.agg.tmp3.sroa.0.0..sroa_cast.i.i.i = select i1 %call5.i75.i.i, i64* %agg.tmp3.sroa.0.0..sroa_cast.i85.i.i, i64* %agg.tmp3.sroa.0.0..sroa_cast.i.i38.i + br label %while.body.i.i28.preheader + +while.body.i.i28.preheader: ; preds = %if.else45.i.i, %if.else34.i.i, %if.else.i.i, %if.then.i.i + %agg.tmp3.sroa.0.0..sroa_cast.i34.i.sink46.i.ph = phi i64* [ %agg.tmp3.sroa.0.0..sroa_cast.i.i38.i, %if.then.i.i ], [ %agg.tmp3.sroa.0.0..sroa_cast.i67.agg.tmp.sroa.0.0..sroa_cast.i.i.i, %if.else.i.i ], [ %agg.tmp.sroa.0.0..sroa_cast.i.i36.i, %if.else34.i.i ], [ %agg.tmp3.sroa.0.0..sroa_cast.i85.agg.tmp3.sroa.0.0..sroa_cast.i.i.i, %if.else45.i.i ] + br label %while.body.i.i28 + +while.body.i.i28: ; preds = %while.end19.i.i, %while.body.i.i28.preheader + %agg.tmp.sroa.0.0..sroa_cast.i.i.sink47.i = phi i64* [ %agg.tmp.sroa.0.0..sroa_cast.i.i.i, %while.end19.i.i ], [ %.sink95.i.i, %while.body.i.i28.preheader ] + %agg.tmp3.sroa.0.0..sroa_cast.i34.i.sink46.i = phi i64* [ %agg.tmp3.sroa.0.0..sroa_cast.i34.i.i, %while.end19.i.i ], [ %agg.tmp3.sroa.0.0..sroa_cast.i34.i.sink46.i.ph, %while.body.i.i28.preheader ] + %__last.sroa.0.0.i.i = phi %struct.ClassProb* [ %incdec.ptr.i30.i.i, %while.end19.i.i ], [ %__first.sroa.0.1.i.i.sink36, %while.body.i.i28.preheader ] + %__first.sroa.0.0.i.i = phi %struct.ClassProb* [ %incdec.ptr.i.i.i, %while.end19.i.i ], [ %add.ptr.i33.i, %while.body.i.i28.preheader ] + %15 = load i64, i64* %agg.tmp.sroa.0.0..sroa_cast.i.i.sink47.i, align 4 + %16 = load i64, i64* %agg.tmp3.sroa.0.0..sroa_cast.i34.i.sink46.i, align 4 + store i64 %16, i64* %agg.tmp.sroa.0.0..sroa_cast.i.i.sink47.i, align 4 + store i64 %15, i64* %agg.tmp3.sroa.0.0..sroa_cast.i34.i.sink46.i, align 4 + br label %while.cond4.i.i + +while.cond4.i.i: ; preds = %while.cond4.i.i, %while.body.i.i28 + %__first.sroa.0.1.i.i = phi %struct.ClassProb* [ %__first.sroa.0.0.i.i, %while.body.i.i28 ], [ %incdec.ptr.i.i.i, %while.cond4.i.i ] + %agg.tmp.sroa.0.0..sroa_cast.i.i.i = bitcast %struct.ClassProb* %__first.sroa.0.1.i.i to i64* + %agg.tmp.sroa.0.0.copyload.i.i.i = load i64, i64* %agg.tmp.sroa.0.0..sroa_cast.i.i.i, align 4 + %agg.tmp3.sroa.0.0.copyload.i.i.i = load i64, i64* %.sink95.i.i, align 4 + %call5.i.i.i = tail call zeroext i1 %__comp.coerce(i64 %agg.tmp.sroa.0.0.copyload.i.i.i, i64 %agg.tmp3.sroa.0.0.copyload.i.i.i) #2 + %incdec.ptr.i.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.sroa.0.1.i.i, i64 1 + br i1 %call5.i.i.i, label %while.cond4.i.i, label %while.cond11.i.i.preheader + +while.cond11.i.i.preheader: ; preds = %while.cond4.i.i + br label %while.cond11.i.i + +while.cond11.i.i: ; preds = %while.cond11.i.i, %while.cond11.i.i.preheader + %__last.sroa.0.1.sink.i.i = phi %struct.ClassProb* [ %incdec.ptr.i30.i.i, %while.cond11.i.i ], [ %__last.sroa.0.0.i.i, %while.cond11.i.i.preheader ] + %incdec.ptr.i30.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__last.sroa.0.1.sink.i.i, i64 -1 + %agg.tmp.sroa.0.0.copyload.i33.i.i = load i64, i64* %.sink95.i.i, align 4 + %agg.tmp3.sroa.0.0..sroa_cast.i34.i.i = bitcast %struct.ClassProb* %incdec.ptr.i30.i.i to i64* + %agg.tmp3.sroa.0.0.copyload.i35.i.i = load i64, i64* %agg.tmp3.sroa.0.0..sroa_cast.i34.i.i, align 4 + %call5.i36.i.i = tail call zeroext i1 %__comp.coerce(i64 %agg.tmp.sroa.0.0.copyload.i33.i.i, i64 %agg.tmp3.sroa.0.0.copyload.i35.i.i) #2 + br i1 %call5.i36.i.i, label %while.cond11.i.i, label %while.end19.i.i + +while.end19.i.i: ; preds = %while.cond11.i.i + %cmp.i.i.i = icmp ult %struct.ClassProb* %__first.sroa.0.1.i.i, %incdec.ptr.i30.i.i + br i1 %cmp.i.i.i, label %while.body.i.i28, label %_ZSt27__unguarded_partition_pivotIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEET_SD_SD_T0_.exit + +_ZSt27__unguarded_partition_pivotIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEET_SD_SD_T0_.exit: ; preds = %while.end19.i.i + tail call void @_ZSt16__introsort_loopIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElNS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_T0_T1_(%struct.ClassProb* %__first.sroa.0.1.i.i, %struct.ClassProb* %__first.sroa.0.1.i.i.sink36, i64 %dec, i1 (i64, i64)* %__comp.coerce) + %17 = ptrtoint %struct.ClassProb* %__first.sroa.0.1.i.i to i64 + %sub.ptr.sub.i = sub i64 %17, %0 + %cmp = icmp sgt i64 %sub.ptr.sub.i, 128 + br i1 %cmp, label %while.body, label %while.end.loopexit48 + +while.end.loopexit: ; preds = %_ZSt10__pop_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_SD_T0_.exit.i + br label %while.end + +while.end.loopexit48: ; preds = %_ZSt27__unguarded_partition_pivotIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEET_SD_SD_T0_.exit + br label %while.end + +while.end: ; preds = %while.end.loopexit48, %while.end.loopexit, %entry + ret void +} + +; Function Attrs: nounwind uwtable +define linkonce_odr void @_ZSt11__make_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEENS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_SD_T0_(%struct.ClassProb* %__first.coerce, %struct.ClassProb* %__last.coerce, i1 (i64, i64)* %__comp.coerce) local_unnamed_addr #3 comdat { +entry: + %0 = ptrtoint %struct.ClassProb* %__first.coerce to i64 + %1 = ptrtoint %struct.ClassProb* %__last.coerce to i64 + %sub.ptr.sub.i = sub i64 %1, %0 + %sub.ptr.div.i = ashr exact i64 %sub.ptr.sub.i, 3 + %cmp = icmp slt i64 %sub.ptr.sub.i, 16 + br i1 %cmp, label %return, label %if.end + +if.end: ; preds = %entry + %sub = add nsw i64 %sub.ptr.div.i, -2 + %div = sdiv i64 %sub, 2 + %sub.i = add nsw i64 %sub.ptr.div.i, -1 + %div.i = sdiv i64 %sub.i, 2 + %and.i = and i64 %sub.ptr.div.i, 1 + %cmp18.i = icmp eq i64 %and.i, 0 + br i1 %cmp18.i, label %while.cond.us.preheader, label %while.cond.preheader + +while.cond.preheader: ; preds = %if.end + br label %while.cond + +while.cond.us.preheader: ; preds = %if.end + %add23.i.us = shl nsw i64 %div, 1 + %sub25.i.us = or i64 %add23.i.us, 1 + %add.ptr.i72.i.us = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %sub25.i.us + %add.ptr.i69.i.us = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %div + %2 = bitcast %struct.ClassProb* %add.ptr.i72.i.us to i64* + %3 = bitcast %struct.ClassProb* %add.ptr.i69.i.us to i64* + br label %while.cond.us + +while.cond.us: ; preds = %_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit.us, %while.cond.us.preheader + %__parent.0.us = phi i64 [ %dec.us, %_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit.us ], [ %div, %while.cond.us.preheader ] + %add.ptr.i.us = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %__parent.0.us + %4 = bitcast %struct.ClassProb* %add.ptr.i.us to i64* + %5 = load i64, i64* %4, align 4 + %cmp84.i.us = icmp sgt i64 %div.i, %__parent.0.us + br i1 %cmp84.i.us, label %while.body.i.us.preheader, label %while.end.i.us + +while.body.i.us.preheader: ; preds = %while.cond.us + br label %while.body.i.us + +while.body.i.us: ; preds = %while.body.i.us, %while.body.i.us.preheader + %__secondChild.085.i.us = phi i64 [ %dec.mul.i.us, %while.body.i.us ], [ %__parent.0.us, %while.body.i.us.preheader ] + %add.i.us = shl i64 %__secondChild.085.i.us, 1 + %mul.i.us = add i64 %add.i.us, 2 + %add.ptr.i.i.us = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %mul.i.us + %sub4.i.us = or i64 %add.i.us, 1 + %add.ptr.i66.i.us = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %sub4.i.us + %agg.tmp.sroa.0.0..sroa_cast.i.i.us = bitcast %struct.ClassProb* %add.ptr.i.i.us to i64* + %agg.tmp.sroa.0.0.copyload.i.i.us = load i64, i64* %agg.tmp.sroa.0.0..sroa_cast.i.i.us, align 4 + %agg.tmp3.sroa.0.0..sroa_cast.i.i.us = bitcast %struct.ClassProb* %add.ptr.i66.i.us to i64* + %agg.tmp3.sroa.0.0.copyload.i.i.us = load i64, i64* %agg.tmp3.sroa.0.0..sroa_cast.i.i.us, align 4 + %call5.i.i.us = tail call zeroext i1 %__comp.coerce(i64 %agg.tmp.sroa.0.0.copyload.i.i.us, i64 %agg.tmp3.sroa.0.0.copyload.i.i.us) #2 + %dec.mul.i.us = select i1 %call5.i.i.us, i64 %sub4.i.us, i64 %mul.i.us + %add.ptr.i78.i.us = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %dec.mul.i.us + %add.ptr.i75.i.us = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %__secondChild.085.i.us + %6 = bitcast %struct.ClassProb* %add.ptr.i78.i.us to i64* + %7 = bitcast %struct.ClassProb* %add.ptr.i75.i.us to i64* + %8 = load i64, i64* %6, align 4 + store i64 %8, i64* %7, align 4 + %cmp.i.us = icmp slt i64 %dec.mul.i.us, %div.i + br i1 %cmp.i.us, label %while.body.i.us, label %while.end.i.us.loopexit + +while.end.i.us.loopexit: ; preds = %while.body.i.us + br label %while.end.i.us + +while.end.i.us: ; preds = %while.end.i.us.loopexit, %while.cond.us + %__secondChild.0.lcssa.i.us = phi i64 [ %__parent.0.us, %while.cond.us ], [ %dec.mul.i.us, %while.end.i.us.loopexit ] + %cmp21.i.us = icmp eq i64 %__secondChild.0.lcssa.i.us, %div + br i1 %cmp21.i.us, label %if.then22.i.us, label %if.end36.i.us + +if.then22.i.us: ; preds = %while.end.i.us + %9 = load i64, i64* %2, align 4 + store i64 %9, i64* %3, align 4 + br label %if.end36.i.us + +if.end36.i.us: ; preds = %if.then22.i.us, %while.end.i.us + %__holeIndex.addr.1.i.us = phi i64 [ %sub25.i.us, %if.then22.i.us ], [ %__secondChild.0.lcssa.i.us, %while.end.i.us ] + %cmp42.i.i.us = icmp sgt i64 %__holeIndex.addr.1.i.us, %__parent.0.us + br i1 %cmp42.i.i.us, label %land.rhs.i.i.us.preheader, label %_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit.us + +land.rhs.i.i.us.preheader: ; preds = %if.end36.i.us + br label %land.rhs.i.i.us + +land.rhs.i.i.us: ; preds = %while.body.i.i.us, %land.rhs.i.i.us.preheader + %__parent.044.in.in.i.i.us = phi i64 [ %__parent.044.i.i.us, %while.body.i.i.us ], [ %__holeIndex.addr.1.i.us, %land.rhs.i.i.us.preheader ] + %__parent.044.in.i.i.us = add nsw i64 %__parent.044.in.in.i.i.us, -1 + %__parent.044.i.i.us = sdiv i64 %__parent.044.in.i.i.us, 2 + %add.ptr.i.i.i.us = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %__parent.044.i.i.us + %agg.tmp.sroa.0.0..sroa_cast.i.i.i.us = bitcast %struct.ClassProb* %add.ptr.i.i.i.us to i64* + %agg.tmp.sroa.0.0.copyload.i.i.i.us = load i64, i64* %agg.tmp.sroa.0.0..sroa_cast.i.i.i.us, align 4 + %call3.i.i.i.us = tail call zeroext i1 %__comp.coerce(i64 %agg.tmp.sroa.0.0.copyload.i.i.i.us, i64 %5) #2 + br i1 %call3.i.i.i.us, label %while.body.i.i.us, label %_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit.us.loopexit + +while.body.i.i.us: ; preds = %land.rhs.i.i.us + %add.ptr.i32.i.i.us = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %__parent.044.in.in.i.i.us + %10 = bitcast %struct.ClassProb* %add.ptr.i32.i.i.us to i64* + %11 = load i64, i64* %agg.tmp.sroa.0.0..sroa_cast.i.i.i.us, align 4 + store i64 %11, i64* %10, align 4 + %cmp.i.i.us = icmp sgt i64 %__parent.044.i.i.us, %__parent.0.us + br i1 %cmp.i.i.us, label %land.rhs.i.i.us, label %_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit.us.loopexit + +_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit.us.loopexit: ; preds = %while.body.i.i.us, %land.rhs.i.i.us + %__parent.0.in.in.lcssa.i.i.us.ph = phi i64 [ %__parent.044.i.i.us, %while.body.i.i.us ], [ %__parent.044.in.in.i.i.us, %land.rhs.i.i.us ] + br label %_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit.us + +_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit.us: ; preds = %_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit.us.loopexit, %if.end36.i.us + %__parent.0.in.in.lcssa.i.i.us = phi i64 [ %__holeIndex.addr.1.i.us, %if.end36.i.us ], [ %__parent.0.in.in.lcssa.i.i.us.ph, %_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit.us.loopexit ] + %add.ptr.i29.i.i.us = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %__parent.0.in.in.lcssa.i.i.us + %12 = bitcast %struct.ClassProb* %add.ptr.i29.i.i.us to i64* + store i64 %5, i64* %12, align 4 + %cmp13.us = icmp eq i64 %__parent.0.us, 0 + %dec.us = add nsw i64 %__parent.0.us, -1 + br i1 %cmp13.us, label %return.loopexit, label %while.cond.us + +while.cond: ; preds = %_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit, %while.cond.preheader + %__parent.0 = phi i64 [ %dec, %_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit ], [ %div, %while.cond.preheader ] + %add.ptr.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %__parent.0 + %13 = bitcast %struct.ClassProb* %add.ptr.i to i64* + %14 = load i64, i64* %13, align 4 + %cmp84.i = icmp sgt i64 %div.i, %__parent.0 + br i1 %cmp84.i, label %while.body.i.preheader, label %_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit + +while.body.i.preheader: ; preds = %while.cond + br label %while.body.i + +while.body.i: ; preds = %while.body.i, %while.body.i.preheader + %__secondChild.085.i = phi i64 [ %dec.mul.i, %while.body.i ], [ %__parent.0, %while.body.i.preheader ] + %add.i = shl i64 %__secondChild.085.i, 1 + %mul.i = add i64 %add.i, 2 + %add.ptr.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %mul.i + %sub4.i = or i64 %add.i, 1 + %add.ptr.i66.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %sub4.i + %agg.tmp.sroa.0.0..sroa_cast.i.i = bitcast %struct.ClassProb* %add.ptr.i.i to i64* + %agg.tmp.sroa.0.0.copyload.i.i = load i64, i64* %agg.tmp.sroa.0.0..sroa_cast.i.i, align 4 + %agg.tmp3.sroa.0.0..sroa_cast.i.i = bitcast %struct.ClassProb* %add.ptr.i66.i to i64* + %agg.tmp3.sroa.0.0.copyload.i.i = load i64, i64* %agg.tmp3.sroa.0.0..sroa_cast.i.i, align 4 + %call5.i.i = tail call zeroext i1 %__comp.coerce(i64 %agg.tmp.sroa.0.0.copyload.i.i, i64 %agg.tmp3.sroa.0.0.copyload.i.i) #2 + %dec.mul.i = select i1 %call5.i.i, i64 %sub4.i, i64 %mul.i + %add.ptr.i78.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %dec.mul.i + %add.ptr.i75.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %__secondChild.085.i + %15 = bitcast %struct.ClassProb* %add.ptr.i78.i to i64* + %16 = bitcast %struct.ClassProb* %add.ptr.i75.i to i64* + %17 = load i64, i64* %15, align 4 + store i64 %17, i64* %16, align 4 + %cmp.i = icmp slt i64 %dec.mul.i, %div.i + br i1 %cmp.i, label %while.body.i, label %if.end36.i + +if.end36.i: ; preds = %while.body.i + %cmp42.i.i = icmp sgt i64 %dec.mul.i, %__parent.0 + br i1 %cmp42.i.i, label %land.rhs.i.i.preheader, label %_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit + +land.rhs.i.i.preheader: ; preds = %if.end36.i + br label %land.rhs.i.i + +land.rhs.i.i: ; preds = %while.body.i.i, %land.rhs.i.i.preheader + %__parent.044.in.in.i.i = phi i64 [ %__parent.044.i.i, %while.body.i.i ], [ %dec.mul.i, %land.rhs.i.i.preheader ] + %__parent.044.in.i.i = add nsw i64 %__parent.044.in.in.i.i, -1 + %__parent.044.i.i = sdiv i64 %__parent.044.in.i.i, 2 + %add.ptr.i.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %__parent.044.i.i + %agg.tmp.sroa.0.0..sroa_cast.i.i.i = bitcast %struct.ClassProb* %add.ptr.i.i.i to i64* + %agg.tmp.sroa.0.0.copyload.i.i.i = load i64, i64* %agg.tmp.sroa.0.0..sroa_cast.i.i.i, align 4 + %call3.i.i.i = tail call zeroext i1 %__comp.coerce(i64 %agg.tmp.sroa.0.0.copyload.i.i.i, i64 %14) #2 + br i1 %call3.i.i.i, label %while.body.i.i, label %_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit.loopexit + +while.body.i.i: ; preds = %land.rhs.i.i + %add.ptr.i32.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %__parent.044.in.in.i.i + %18 = bitcast %struct.ClassProb* %add.ptr.i32.i.i to i64* + %19 = load i64, i64* %agg.tmp.sroa.0.0..sroa_cast.i.i.i, align 4 + store i64 %19, i64* %18, align 4 + %cmp.i.i = icmp sgt i64 %__parent.044.i.i, %__parent.0 + br i1 %cmp.i.i, label %land.rhs.i.i, label %_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit.loopexit + +_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit.loopexit: ; preds = %while.body.i.i, %land.rhs.i.i + %__parent.0.in.in.lcssa.i.i.ph = phi i64 [ %__parent.044.i.i, %while.body.i.i ], [ %__parent.044.in.in.i.i, %land.rhs.i.i ] + br label %_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit + +_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit: ; preds = %_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit.loopexit, %if.end36.i, %while.cond + %__parent.0.in.in.lcssa.i.i = phi i64 [ %dec.mul.i, %if.end36.i ], [ %__parent.0, %while.cond ], [ %__parent.0.in.in.lcssa.i.i.ph, %_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit.loopexit ] + %add.ptr.i29.i.i = getelementptr inbounds %struct.ClassProb, %struct.ClassProb* %__first.coerce, i64 %__parent.0.in.in.lcssa.i.i + %20 = bitcast %struct.ClassProb* %add.ptr.i29.i.i to i64* + store i64 %14, i64* %20, align 4 + %cmp13 = icmp eq i64 %__parent.0, 0 + %dec = add nsw i64 %__parent.0, -1 + br i1 %cmp13, label %return.loopexit34, label %while.cond + +return.loopexit: ; preds = %_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit.us + br label %return + +return.loopexit34: ; preds = %_ZSt13__adjust_heapIN9__gnu_cxx17__normal_iteratorIP9ClassProbSt6vectorIS2_SaIS2_EEEElS2_NS0_5__ops15_Iter_comp_iterIPFbS2_S2_EEEEvT_T0_SE_T1_T2_.exit + br label %return + +return: ; preds = %return.loopexit34, %return.loopexit, %entry + ret void +} + +; Function Attrs: nounwind readnone +declare i64 @llvm.ctlz.i64(i64, i1) #12 + +declare dereferenceable(32) %"class.std::__cxx11::basic_string"* @_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE10_M_replaceEmmPKcm(%"class.std::__cxx11::basic_string"*, i64, i64, i8*, i64) local_unnamed_addr #0 + +; Function Attrs: nounwind uwtable +define internal void @_GLOBAL__sub_I_vgg16_cifar10.cpp() #3 section ".text.startup" { +entry: + tail call void @_ZNSt8ios_base4InitC1Ev(%"class.std::ios_base::Init"* nonnull @_ZStL8__ioinit) #2 + %0 = tail call i32 @__cxa_atexit(void (i8*)* bitcast (void (%"class.std::ios_base::Init"*)* @_ZNSt8ios_base4InitD1Ev to void (i8*)*), i8* getelementptr inbounds (%"class.std::ios_base::Init", %"class.std::ios_base::Init"* @_ZStL8__ioinit, i64 0, i32 0), i8* nonnull @__dso_handle) #2 + tail call void @llvm.memset.p0i8.i64(i8* bitcast (%"class.std::vector"* @run_accuracies to i8*), i8 0, i64 24, i32 8, i1 false) #2 + %1 = tail call i32 @__cxa_atexit(void (i8*)* bitcast (void (%"class.std::vector"*)* @_ZNSt6vectorIfSaIfEED2Ev to void (i8*)*), i8* bitcast (%"class.std::vector"* @run_accuracies to i8*), i8* nonnull @__dso_handle) #2 + ret void +} ; Function Attrs: nounwind -declare i32 @puts(i8* nocapture readonly) #7 +declare i32 @puts(i8* nocapture readonly) #2 declare i32 @putchar(i32) +; Function Attrs: nounwind +declare i32 @fputc(i32, %struct._IO_FILE* nocapture) #2 + ; Function Attrs: argmemonly nounwind -declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i32, i1) #1 +declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i32, i1) #4 ; Function Attrs: nounwind -declare i8* @llvm.visc.tensor.convolution(i8*, i8*, i32, i32, i32, i32) #7 +declare i8* @llvm.visc.tensor.convolution(i8*, i8*, i32, i32, i32, i32) #2 ; Function Attrs: nounwind uwtable -define %struct.out._Z10var_0_nodePvmS_m @_Z10var_0_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #0 { +define %struct.out._Z10var_0_nodePvmS_m @_Z10var_0_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #3 { entry: %call1 = call i8* @llvm.visc.tensor.convolution(i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1) %returnStruct = insertvalue %struct.out._Z10var_0_nodePvmS_m undef, i8* %call1, 0 @@ -4689,10 +7872,10 @@ entry: } ; Function Attrs: nounwind -declare i8* @llvm.visc.tensor.add(i8*, i8*) #7 +declare i8* @llvm.visc.tensor.add(i8*, i8*) #2 ; Function Attrs: nounwind uwtable -define %struct.out._Z10var_1_nodePvmS_m @_Z10var_1_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #0 { +define %struct.out._Z10var_1_nodePvmS_m @_Z10var_1_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #3 { entry: %call1 = call i8* @llvm.visc.tensor.add(i8* %t1, i8* %t2) %returnStruct = insertvalue %struct.out._Z10var_1_nodePvmS_m undef, i8* %call1, 0 @@ -4701,10 +7884,10 @@ entry: } ; Function Attrs: nounwind -declare i8* @llvm.visc.tensor.relu(i8*) #7 +declare i8* @llvm.visc.tensor.relu(i8*) #2 ; Function Attrs: nounwind uwtable -define %struct.out._Z10var_2_nodePvm @_Z10var_2_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #0 { +define %struct.out._Z10var_2_nodePvm @_Z10var_2_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #3 { entry: %call1 = call i8* @llvm.visc.tensor.relu(i8* %t1) %returnStruct = insertvalue %struct.out._Z10var_2_nodePvm undef, i8* %call1, 0 @@ -4713,7 +7896,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z10var_3_nodePvmS_m @_Z10var_3_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #0 { +define %struct.out._Z10var_3_nodePvmS_m @_Z10var_3_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #3 { entry: %call1 = call i8* @llvm.visc.tensor.convolution(i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1) %returnStruct = insertvalue %struct.out._Z10var_3_nodePvmS_m undef, i8* %call1, 0 @@ -4722,7 +7905,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z10var_4_nodePvmS_m @_Z10var_4_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #0 { +define %struct.out._Z10var_4_nodePvmS_m @_Z10var_4_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #3 { entry: %call1 = call i8* @llvm.visc.tensor.add(i8* %t1, i8* %t2) %returnStruct = insertvalue %struct.out._Z10var_4_nodePvmS_m undef, i8* %call1, 0 @@ -4731,7 +7914,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z10var_5_nodePvm @_Z10var_5_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #0 { +define %struct.out._Z10var_5_nodePvm @_Z10var_5_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #3 { entry: %call1 = call i8* @llvm.visc.tensor.relu(i8* %t1) %returnStruct = insertvalue %struct.out._Z10var_5_nodePvm undef, i8* %call1, 0 @@ -4740,10 +7923,10 @@ entry: } ; Function Attrs: nounwind -declare i8* @llvm.visc.tensor.pool.max(i8*, i32, i32, i32, i32, i32, i32) #7 +declare i8* @llvm.visc.tensor.pool.max(i8*, i32, i32, i32, i32, i32, i32) #2 ; Function Attrs: nounwind uwtable -define %struct.out._Z10var_6_nodePvm @_Z10var_6_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #0 { +define %struct.out._Z10var_6_nodePvm @_Z10var_6_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #3 { entry: %call1 = call i8* @llvm.visc.tensor.pool.max(i8* %t1, i32 2, i32 2, i32 0, i32 0, i32 2, i32 2) %returnStruct = insertvalue %struct.out._Z10var_6_nodePvm undef, i8* %call1, 0 @@ -4752,7 +7935,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z10var_7_nodePvmS_m @_Z10var_7_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #0 { +define %struct.out._Z10var_7_nodePvmS_m @_Z10var_7_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #3 { entry: %call1 = call i8* @llvm.visc.tensor.convolution(i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1) %returnStruct = insertvalue %struct.out._Z10var_7_nodePvmS_m undef, i8* %call1, 0 @@ -4761,7 +7944,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z10var_8_nodePvmS_m @_Z10var_8_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #0 { +define %struct.out._Z10var_8_nodePvmS_m @_Z10var_8_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #3 { entry: %call1 = call i8* @llvm.visc.tensor.add(i8* %t1, i8* %t2) %returnStruct = insertvalue %struct.out._Z10var_8_nodePvmS_m undef, i8* %call1, 0 @@ -4770,7 +7953,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z10var_9_nodePvm @_Z10var_9_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #0 { +define %struct.out._Z10var_9_nodePvm @_Z10var_9_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #3 { entry: %call1 = call i8* @llvm.visc.tensor.relu(i8* %t1) %returnStruct = insertvalue %struct.out._Z10var_9_nodePvm undef, i8* %call1, 0 @@ -4779,7 +7962,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z11var_10_nodePvmS_m @_Z11var_10_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #0 { +define %struct.out._Z11var_10_nodePvmS_m @_Z11var_10_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #3 { entry: %call1 = call i8* @llvm.visc.tensor.convolution(i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1) %returnStruct = insertvalue %struct.out._Z11var_10_nodePvmS_m undef, i8* %call1, 0 @@ -4788,7 +7971,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z11var_11_nodePvmS_m @_Z11var_11_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #0 { +define %struct.out._Z11var_11_nodePvmS_m @_Z11var_11_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #3 { entry: %call1 = call i8* @llvm.visc.tensor.add(i8* %t1, i8* %t2) %returnStruct = insertvalue %struct.out._Z11var_11_nodePvmS_m undef, i8* %call1, 0 @@ -4797,7 +7980,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z11var_12_nodePvm @_Z11var_12_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #0 { +define %struct.out._Z11var_12_nodePvm @_Z11var_12_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #3 { entry: %call1 = call i8* @llvm.visc.tensor.relu(i8* %t1) %returnStruct = insertvalue %struct.out._Z11var_12_nodePvm undef, i8* %call1, 0 @@ -4806,7 +7989,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z11var_13_nodePvm @_Z11var_13_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #0 { +define %struct.out._Z11var_13_nodePvm @_Z11var_13_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #3 { entry: %call1 = call i8* @llvm.visc.tensor.pool.max(i8* %t1, i32 2, i32 2, i32 0, i32 0, i32 2, i32 2) %returnStruct = insertvalue %struct.out._Z11var_13_nodePvm undef, i8* %call1, 0 @@ -4815,7 +7998,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z11var_14_nodePvmS_m @_Z11var_14_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #0 { +define %struct.out._Z11var_14_nodePvmS_m @_Z11var_14_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #3 { entry: %call1 = call i8* @llvm.visc.tensor.convolution(i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1) %returnStruct = insertvalue %struct.out._Z11var_14_nodePvmS_m undef, i8* %call1, 0 @@ -4824,7 +8007,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z11var_15_nodePvmS_m @_Z11var_15_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #0 { +define %struct.out._Z11var_15_nodePvmS_m @_Z11var_15_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #3 { entry: %call1 = call i8* @llvm.visc.tensor.add(i8* %t1, i8* %t2) %returnStruct = insertvalue %struct.out._Z11var_15_nodePvmS_m undef, i8* %call1, 0 @@ -4833,7 +8016,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z11var_16_nodePvm @_Z11var_16_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #0 { +define %struct.out._Z11var_16_nodePvm @_Z11var_16_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #3 { entry: %call1 = call i8* @llvm.visc.tensor.relu(i8* %t1) %returnStruct = insertvalue %struct.out._Z11var_16_nodePvm undef, i8* %call1, 0 @@ -4842,7 +8025,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z11var_17_nodePvmS_m @_Z11var_17_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #0 { +define %struct.out._Z11var_17_nodePvmS_m @_Z11var_17_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #3 { entry: %call1 = call i8* @llvm.visc.tensor.convolution(i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1) %returnStruct = insertvalue %struct.out._Z11var_17_nodePvmS_m undef, i8* %call1, 0 @@ -4851,7 +8034,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z11var_18_nodePvmS_m @_Z11var_18_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #0 { +define %struct.out._Z11var_18_nodePvmS_m @_Z11var_18_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #3 { entry: %call1 = call i8* @llvm.visc.tensor.add(i8* %t1, i8* %t2) %returnStruct = insertvalue %struct.out._Z11var_18_nodePvmS_m undef, i8* %call1, 0 @@ -4860,7 +8043,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z11var_19_nodePvm @_Z11var_19_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #0 { +define %struct.out._Z11var_19_nodePvm @_Z11var_19_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #3 { entry: %call1 = call i8* @llvm.visc.tensor.relu(i8* %t1) %returnStruct = insertvalue %struct.out._Z11var_19_nodePvm undef, i8* %call1, 0 @@ -4869,7 +8052,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z11var_20_nodePvmS_m @_Z11var_20_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #0 { +define %struct.out._Z11var_20_nodePvmS_m @_Z11var_20_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #3 { entry: %call1 = call i8* @llvm.visc.tensor.convolution(i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1) %returnStruct = insertvalue %struct.out._Z11var_20_nodePvmS_m undef, i8* %call1, 0 @@ -4878,7 +8061,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z11var_21_nodePvmS_m @_Z11var_21_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #0 { +define %struct.out._Z11var_21_nodePvmS_m @_Z11var_21_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #3 { entry: %call1 = call i8* @llvm.visc.tensor.add(i8* %t1, i8* %t2) %returnStruct = insertvalue %struct.out._Z11var_21_nodePvmS_m undef, i8* %call1, 0 @@ -4887,7 +8070,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z11var_22_nodePvm @_Z11var_22_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #0 { +define %struct.out._Z11var_22_nodePvm @_Z11var_22_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #3 { entry: %call1 = call i8* @llvm.visc.tensor.relu(i8* %t1) %returnStruct = insertvalue %struct.out._Z11var_22_nodePvm undef, i8* %call1, 0 @@ -4896,7 +8079,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z11var_23_nodePvm @_Z11var_23_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #0 { +define %struct.out._Z11var_23_nodePvm @_Z11var_23_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #3 { entry: %call1 = call i8* @llvm.visc.tensor.pool.max(i8* %t1, i32 2, i32 2, i32 0, i32 0, i32 2, i32 2) %returnStruct = insertvalue %struct.out._Z11var_23_nodePvm undef, i8* %call1, 0 @@ -4905,7 +8088,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z11var_24_nodePvmS_m @_Z11var_24_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #0 { +define %struct.out._Z11var_24_nodePvmS_m @_Z11var_24_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #3 { entry: %call1 = call i8* @llvm.visc.tensor.convolution(i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1) %returnStruct = insertvalue %struct.out._Z11var_24_nodePvmS_m undef, i8* %call1, 0 @@ -4914,7 +8097,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z11var_25_nodePvmS_m @_Z11var_25_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #0 { +define %struct.out._Z11var_25_nodePvmS_m @_Z11var_25_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #3 { entry: %call1 = call i8* @llvm.visc.tensor.add(i8* %t1, i8* %t2) %returnStruct = insertvalue %struct.out._Z11var_25_nodePvmS_m undef, i8* %call1, 0 @@ -4923,7 +8106,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z11var_26_nodePvm @_Z11var_26_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #0 { +define %struct.out._Z11var_26_nodePvm @_Z11var_26_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #3 { entry: %call1 = call i8* @llvm.visc.tensor.relu(i8* %t1) %returnStruct = insertvalue %struct.out._Z11var_26_nodePvm undef, i8* %call1, 0 @@ -4932,7 +8115,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z11var_27_nodePvmS_m @_Z11var_27_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #0 { +define %struct.out._Z11var_27_nodePvmS_m @_Z11var_27_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #3 { entry: %call1 = call i8* @llvm.visc.tensor.convolution(i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1) %returnStruct = insertvalue %struct.out._Z11var_27_nodePvmS_m undef, i8* %call1, 0 @@ -4941,7 +8124,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z11var_28_nodePvmS_m @_Z11var_28_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #0 { +define %struct.out._Z11var_28_nodePvmS_m @_Z11var_28_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #3 { entry: %call1 = call i8* @llvm.visc.tensor.add(i8* %t1, i8* %t2) %returnStruct = insertvalue %struct.out._Z11var_28_nodePvmS_m undef, i8* %call1, 0 @@ -4950,7 +8133,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z11var_29_nodePvm @_Z11var_29_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #0 { +define %struct.out._Z11var_29_nodePvm @_Z11var_29_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #3 { entry: %call1 = call i8* @llvm.visc.tensor.relu(i8* %t1) %returnStruct = insertvalue %struct.out._Z11var_29_nodePvm undef, i8* %call1, 0 @@ -4959,7 +8142,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z11var_30_nodePvmS_m @_Z11var_30_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #0 { +define %struct.out._Z11var_30_nodePvmS_m @_Z11var_30_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #3 { entry: %call1 = call i8* @llvm.visc.tensor.convolution(i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1) %returnStruct = insertvalue %struct.out._Z11var_30_nodePvmS_m undef, i8* %call1, 0 @@ -4968,7 +8151,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z11var_31_nodePvmS_m @_Z11var_31_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #0 { +define %struct.out._Z11var_31_nodePvmS_m @_Z11var_31_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #3 { entry: %call1 = call i8* @llvm.visc.tensor.add(i8* %t1, i8* %t2) %returnStruct = insertvalue %struct.out._Z11var_31_nodePvmS_m undef, i8* %call1, 0 @@ -4977,7 +8160,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z11var_32_nodePvm @_Z11var_32_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #0 { +define %struct.out._Z11var_32_nodePvm @_Z11var_32_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #3 { entry: %call1 = call i8* @llvm.visc.tensor.relu(i8* %t1) %returnStruct = insertvalue %struct.out._Z11var_32_nodePvm undef, i8* %call1, 0 @@ -4986,7 +8169,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z11var_33_nodePvm @_Z11var_33_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #0 { +define %struct.out._Z11var_33_nodePvm @_Z11var_33_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #3 { entry: %call1 = call i8* @llvm.visc.tensor.pool.max(i8* %t1, i32 2, i32 2, i32 0, i32 0, i32 2, i32 2) %returnStruct = insertvalue %struct.out._Z11var_33_nodePvm undef, i8* %call1, 0 @@ -4995,7 +8178,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z11var_34_nodePvmS_m @_Z11var_34_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #0 { +define %struct.out._Z11var_34_nodePvmS_m @_Z11var_34_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #3 { entry: %call1 = call i8* @llvm.visc.tensor.convolution(i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1) %returnStruct = insertvalue %struct.out._Z11var_34_nodePvmS_m undef, i8* %call1, 0 @@ -5004,7 +8187,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z11var_35_nodePvmS_m @_Z11var_35_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #0 { +define %struct.out._Z11var_35_nodePvmS_m @_Z11var_35_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #3 { entry: %call1 = call i8* @llvm.visc.tensor.add(i8* %t1, i8* %t2) %returnStruct = insertvalue %struct.out._Z11var_35_nodePvmS_m undef, i8* %call1, 0 @@ -5013,7 +8196,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z11var_36_nodePvm @_Z11var_36_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #0 { +define %struct.out._Z11var_36_nodePvm @_Z11var_36_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #3 { entry: %call1 = call i8* @llvm.visc.tensor.relu(i8* %t1) %returnStruct = insertvalue %struct.out._Z11var_36_nodePvm undef, i8* %call1, 0 @@ -5022,7 +8205,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z11var_37_nodePvmS_m @_Z11var_37_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #0 { +define %struct.out._Z11var_37_nodePvmS_m @_Z11var_37_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #3 { entry: %call1 = call i8* @llvm.visc.tensor.convolution(i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1) %returnStruct = insertvalue %struct.out._Z11var_37_nodePvmS_m undef, i8* %call1, 0 @@ -5031,7 +8214,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z11var_38_nodePvmS_m @_Z11var_38_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #0 { +define %struct.out._Z11var_38_nodePvmS_m @_Z11var_38_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #3 { entry: %call1 = call i8* @llvm.visc.tensor.add(i8* %t1, i8* %t2) %returnStruct = insertvalue %struct.out._Z11var_38_nodePvmS_m undef, i8* %call1, 0 @@ -5040,7 +8223,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z11var_39_nodePvm @_Z11var_39_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #0 { +define %struct.out._Z11var_39_nodePvm @_Z11var_39_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #3 { entry: %call1 = call i8* @llvm.visc.tensor.relu(i8* %t1) %returnStruct = insertvalue %struct.out._Z11var_39_nodePvm undef, i8* %call1, 0 @@ -5049,7 +8232,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z11var_40_nodePvmS_m @_Z11var_40_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #0 { +define %struct.out._Z11var_40_nodePvmS_m @_Z11var_40_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #3 { entry: %call1 = call i8* @llvm.visc.tensor.convolution(i8* %t1, i8* %t2, i32 1, i32 1, i32 1, i32 1) %returnStruct = insertvalue %struct.out._Z11var_40_nodePvmS_m undef, i8* %call1, 0 @@ -5058,7 +8241,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z11var_41_nodePvmS_m @_Z11var_41_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #0 { +define %struct.out._Z11var_41_nodePvmS_m @_Z11var_41_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #3 { entry: %call1 = call i8* @llvm.visc.tensor.add(i8* %t1, i8* %t2) %returnStruct = insertvalue %struct.out._Z11var_41_nodePvmS_m undef, i8* %call1, 0 @@ -5067,7 +8250,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z11var_42_nodePvm @_Z11var_42_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #0 { +define %struct.out._Z11var_42_nodePvm @_Z11var_42_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #3 { entry: %call1 = call i8* @llvm.visc.tensor.relu(i8* %t1) %returnStruct = insertvalue %struct.out._Z11var_42_nodePvm undef, i8* %call1, 0 @@ -5076,7 +8259,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z11var_43_nodePvm @_Z11var_43_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #0 { +define %struct.out._Z11var_43_nodePvm @_Z11var_43_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #3 { entry: %call1 = call i8* @llvm.visc.tensor.pool.max(i8* %t1, i32 2, i32 2, i32 0, i32 0, i32 2, i32 2) %returnStruct = insertvalue %struct.out._Z11var_43_nodePvm undef, i8* %call1, 0 @@ -5085,10 +8268,10 @@ entry: } ; Function Attrs: nounwind -declare i8* @llvm.visc.tensor.mul(i8*, i8*) #7 +declare i8* @llvm.visc.tensor.mul(i8*, i8*) #2 ; Function Attrs: nounwind uwtable -define %struct.out._Z11var_44_nodePvmS_m @_Z11var_44_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #0 { +define %struct.out._Z11var_44_nodePvmS_m @_Z11var_44_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #3 { entry: %call1 = call i8* @llvm.visc.tensor.mul(i8* %t1, i8* %t2) %returnStruct = insertvalue %struct.out._Z11var_44_nodePvmS_m undef, i8* %call1, 0 @@ -5097,7 +8280,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z11var_45_nodePvmS_m @_Z11var_45_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #0 { +define %struct.out._Z11var_45_nodePvmS_m @_Z11var_45_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #3 { entry: %call1 = call i8* @llvm.visc.tensor.add(i8* %t1, i8* %t2) %returnStruct = insertvalue %struct.out._Z11var_45_nodePvmS_m undef, i8* %call1, 0 @@ -5106,7 +8289,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z11var_46_nodePvm @_Z11var_46_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #0 { +define %struct.out._Z11var_46_nodePvm @_Z11var_46_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #3 { entry: %call1 = call i8* @llvm.visc.tensor.relu(i8* %t1) %returnStruct = insertvalue %struct.out._Z11var_46_nodePvm undef, i8* %call1, 0 @@ -5115,7 +8298,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z11var_47_nodePvmS_m @_Z11var_47_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #0 { +define %struct.out._Z11var_47_nodePvmS_m @_Z11var_47_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #3 { entry: %call1 = call i8* @llvm.visc.tensor.mul(i8* %t1, i8* %t2) %returnStruct = insertvalue %struct.out._Z11var_47_nodePvmS_m undef, i8* %call1, 0 @@ -5124,7 +8307,7 @@ entry: } ; Function Attrs: nounwind uwtable -define %struct.out._Z11var_48_nodePvmS_m @_Z11var_48_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #0 { +define %struct.out._Z11var_48_nodePvmS_m @_Z11var_48_nodePvmS_m_cloned(i8* in %t1, i64 %bytes_t1, i8* in %t2, i64 %bytes_t2) #3 { entry: %call1 = call i8* @llvm.visc.tensor.add(i8* %t1, i8* %t2) %returnStruct = insertvalue %struct.out._Z11var_48_nodePvmS_m undef, i8* %call1, 0 @@ -5133,10 +8316,10 @@ entry: } ; Function Attrs: nounwind -declare i8* @llvm.visc.tensor.softmax(i8*) #7 +declare i8* @llvm.visc.tensor.softmax(i8*) #2 ; Function Attrs: nounwind uwtable -define %struct.out._Z11var_49_nodePvm @_Z11var_49_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #0 { +define %struct.out._Z11var_49_nodePvm @_Z11var_49_nodePvm_cloned(i8* in %t1, i64 %bytes_t1) #3 { entry: %call1 = call i8* @llvm.visc.tensor.softmax(i8* %t1) %returnStruct = insertvalue %struct.out._Z11var_49_nodePvm undef, i8* %call1, 0 @@ -5145,19 +8328,19 @@ entry: } ; Function Attrs: nounwind -declare i8* @llvm.visc.createNode(i8*) #7 +declare i8* @llvm.visc.createNode(i8*) #2 ; Function Attrs: nounwind -declare void @llvm.visc.bind.input(i8*, i32, i32, i1) #7 +declare void @llvm.visc.bind.input(i8*, i32, i32, i1) #2 ; Function Attrs: nounwind -declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32, i1) #7 +declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32, i1) #2 ; Function Attrs: nounwind -declare void @llvm.visc.bind.output(i8*, i32, i32, i1) #7 +declare void @llvm.visc.bind.output(i8*, i32, i32, i1) #2 ; Function Attrs: nounwind uwtable -define %struct.out._Z4rootPvmS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_m @_Z4rootPvmS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_m_cloned(i8* in %input, i64 %input_bytes, i8* in %conv2d_1_w, i64 %conv2d_1_w_bytes, i8* in %conv2d_1_b, i64 %conv2d_1_b_bytes, i8* in %conv2d_2_w, i64 %conv2d_2_w_bytes, i8* in %conv2d_2_b, i64 %conv2d_2_b_bytes, i8* in %conv2d_3_w, i64 %conv2d_3_w_bytes, i8* in %conv2d_3_b, i64 %conv2d_3_b_bytes, i8* in %conv2d_4_w, i64 %conv2d_4_w_bytes, i8* in %conv2d_4_b, i64 %conv2d_4_b_bytes, i8* in %conv2d_5_w, i64 %conv2d_5_w_bytes, i8* in %conv2d_5_b, i64 %conv2d_5_b_bytes, i8* in %conv2d_6_w, i64 %conv2d_6_w_bytes, i8* in %conv2d_6_b, i64 %conv2d_6_b_bytes, i8* in %conv2d_7_w, i64 %conv2d_7_w_bytes, i8* in %conv2d_7_b, i64 %conv2d_7_b_bytes, i8* in %conv2d_8_w, i64 %conv2d_8_w_bytes, i8* in %conv2d_8_b, i64 %conv2d_8_b_bytes, i8* in %conv2d_9_w, i64 %conv2d_9_w_bytes, i8* in %conv2d_9_b, i64 %conv2d_9_b_bytes, i8* in %conv2d_10_w, i64 %conv2d_10_w_bytes, i8* in %conv2d_10_b, i64 %conv2d_10_b_bytes, i8* in %conv2d_11_w, i64 %conv2d_11_w_bytes, i8* in %conv2d_11_b, i64 %conv2d_11_b_bytes, i8* in %conv2d_12_w, i64 %conv2d_12_w_bytes, i8* in %conv2d_12_b, i64 %conv2d_12_b_bytes, i8* in %conv2d_13_w, i64 %conv2d_13_w_bytes, i8* in %conv2d_13_b, i64 %conv2d_13_b_bytes, i8* in %dense_1_w, i64 %dense_1_w_bytes, i8* in %dense_1_b, i64 %dense_1_b_bytes, i8* in %dense_2_w, i64 %dense_2_w_bytes, i8* in %dense_2_b, i64 %dense_2_b_bytes) #0 { +define %struct.out._Z4rootPvmS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_m @_Z4rootPvmS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_m_cloned(i8* in %input, i64 %input_bytes, i8* in %conv2d_1_w, i64 %conv2d_1_w_bytes, i8* in %conv2d_1_b, i64 %conv2d_1_b_bytes, i8* in %conv2d_2_w, i64 %conv2d_2_w_bytes, i8* in %conv2d_2_b, i64 %conv2d_2_b_bytes, i8* in %conv2d_3_w, i64 %conv2d_3_w_bytes, i8* in %conv2d_3_b, i64 %conv2d_3_b_bytes, i8* in %conv2d_4_w, i64 %conv2d_4_w_bytes, i8* in %conv2d_4_b, i64 %conv2d_4_b_bytes, i8* in %conv2d_5_w, i64 %conv2d_5_w_bytes, i8* in %conv2d_5_b, i64 %conv2d_5_b_bytes, i8* in %conv2d_6_w, i64 %conv2d_6_w_bytes, i8* in %conv2d_6_b, i64 %conv2d_6_b_bytes, i8* in %conv2d_7_w, i64 %conv2d_7_w_bytes, i8* in %conv2d_7_b, i64 %conv2d_7_b_bytes, i8* in %conv2d_8_w, i64 %conv2d_8_w_bytes, i8* in %conv2d_8_b, i64 %conv2d_8_b_bytes, i8* in %conv2d_9_w, i64 %conv2d_9_w_bytes, i8* in %conv2d_9_b, i64 %conv2d_9_b_bytes, i8* in %conv2d_10_w, i64 %conv2d_10_w_bytes, i8* in %conv2d_10_b, i64 %conv2d_10_b_bytes, i8* in %conv2d_11_w, i64 %conv2d_11_w_bytes, i8* in %conv2d_11_b, i64 %conv2d_11_b_bytes, i8* in %conv2d_12_w, i64 %conv2d_12_w_bytes, i8* in %conv2d_12_b, i64 %conv2d_12_b_bytes, i8* in %conv2d_13_w, i64 %conv2d_13_w_bytes, i8* in %conv2d_13_b, i64 %conv2d_13_b_bytes, i8* in %dense_1_w, i64 %dense_1_w_bytes, i8* in %dense_1_b, i64 %dense_1_b_bytes, i8* in %dense_2_w, i64 %dense_2_w_bytes, i8* in %dense_2_b, i64 %dense_2_b_bytes) #3 { entry: %_Z10var_0_nodePvmS_m_cloned.node = call i8* @llvm.visc.createNode(i8* bitcast (%struct.out._Z10var_0_nodePvmS_m (i8*, i64, i8*, i64)* @_Z10var_0_nodePvmS_m_cloned to i8*)) call void @llvm.visc.bind.input(i8* %_Z10var_0_nodePvmS_m_cloned.node, i32 0, i32 0, i1 false) @@ -5375,26 +8558,31 @@ entry: } ; Function Attrs: nounwind -declare void @llvm.visc.init() #7 +declare void @llvm.visc.init() #2 ; Function Attrs: nounwind -declare i8* @llvm.visc.launch(i8*, i8*, i1) #7 +declare i8* @llvm.visc.launch(i8*, i8*, i1) #2 ; Function Attrs: nounwind -declare void @llvm.visc.wait(i8*) #7 +declare void @llvm.visc.wait(i8*) #2 ; Function Attrs: nounwind -declare void @llvm.visc.cleanup() #7 - -attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #1 = { argmemonly nounwind } -attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #4 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #5 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #6 = { nobuiltin nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #7 = { nounwind } -attributes #8 = { noreturn nounwind } +declare void @llvm.visc.cleanup() #2 + +attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #1 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #2 = { nounwind } +attributes #3 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #4 = { argmemonly nounwind } +attributes #5 = { noreturn nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #6 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #7 = { norecurse nounwind readnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #8 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #9 = { nobuiltin nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #10 = { noreturn "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #11 = { nobuiltin "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #12 = { nounwind readnone } +attributes #13 = { noreturn nounwind } !llvm.ident = !{!0} !visc_hint_cudnn = !{!1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50} @@ -5457,85 +8645,85 @@ attributes #8 = { noreturn nounwind } !49 = !{%struct.out._Z11var_48_nodePvmS_m (i8*, i64, i8*, i64)* @_Z11var_48_nodePvmS_m_cloned} !50 = !{%struct.out._Z11var_49_nodePvm (i8*, i64)* @_Z11var_49_nodePvm_cloned} !51 = !{%struct.out._Z4rootPvmS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_m (i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64, i8*, i64)* @_Z4rootPvmS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_mS_m_cloned} -!52 = !{!53, !58, i64 40} -!53 = !{!"_ZTS6Tensor", !54, i64 0, !54, i64 4, !57, i64 8, !58, i64 16, !58, i64 24, !58, i64 32, !58, i64 40, !59, i64 48, !59, i64 56, !60, i64 64} -!54 = !{!"int", !55, i64 0} -!55 = !{!"omnipotent char", !56, i64 0} -!56 = !{!"Simple C++ TBAA"} -!57 = !{!"_ZTS15data_location_t", !55, i64 0} -!58 = !{!"any pointer", !55, i64 0} -!59 = !{!"long", !55, i64 0} -!60 = !{!"_ZTS9Dimension", !54, i64 0, !58, i64 8} -!61 = !{!53, !54, i64 64} -!62 = !{!53, !58, i64 72} -!63 = !{!59, !59, i64 0} -!64 = !{!53, !59, i64 48} -!65 = !{!53, !59, i64 56} -!66 = !{!53, !58, i64 32} -!67 = !{!53, !54, i64 0} -!68 = !{!69, !69, i64 0} -!69 = !{!"float", !55, i64 0} -!70 = distinct !{!70, !71} -!71 = !{!"llvm.loop.unroll.disable"} -!72 = distinct !{!72, !73, !74} -!73 = !{!"llvm.loop.vectorize.width", i32 1} -!74 = !{!"llvm.loop.interleave.count", i32 1} -!75 = distinct !{!75, !73, !74} -!76 = distinct !{!76, !71} -!77 = distinct !{!77, !73, !74} -!78 = distinct !{!78, !71} -!79 = distinct !{!79, !73, !74} -!80 = distinct !{!80, !73, !74} -!81 = distinct !{!81, !73, !74} -!82 = distinct !{!82, !71} -!83 = distinct !{!83, !73, !74} -!84 = distinct !{!84, !73, !74} -!85 = distinct !{!85, !73, !74} -!86 = distinct !{!86, !73, !74} -!87 = !{!55, !55, i64 0} -!88 = distinct !{!88, !73, !74} -!89 = distinct !{!89, !90, !73, !74} -!90 = !{!"llvm.loop.unroll.runtime.disable"} -!91 = !{!92, !92, i64 0} -!92 = !{!"vtable pointer", !56, i64 0} -!93 = !{!94, !58, i64 216} -!94 = !{!"_ZTSSt9basic_iosIcSt11char_traitsIcEE", !58, i64 216, !55, i64 224, !95, i64 225, !58, i64 232, !58, i64 240, !58, i64 248, !58, i64 256} -!95 = !{!"bool", !55, i64 0} -!96 = !{!94, !55, i64 224} -!97 = !{!94, !95, i64 225} -!98 = !{!99, !100, i64 64} -!99 = !{!"_ZTSNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE", !100, i64 64, !101, i64 72} -!100 = !{!"_ZTSSt13_Ios_Openmode", !55, i64 0} -!101 = !{!"_ZTSNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE", !102, i64 0, !59, i64 8, !55, i64 16} -!102 = !{!"_ZTSNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_Alloc_hiderE", !58, i64 0} -!103 = !{!102, !58, i64 0} -!104 = !{!101, !59, i64 8} -!105 = !{!106, !106, i64 0} -!106 = !{!"_ZTSSt13_Ios_Fmtflags", !55, i64 0} -!107 = !{!101, !58, i64 0} -!108 = !{!109, !106, i64 24} -!109 = !{!"_ZTSSt8ios_base", !59, i64 8, !59, i64 16, !106, i64 24, !110, i64 28, !110, i64 32, !58, i64 40, !111, i64 48, !55, i64 64, !54, i64 192, !58, i64 200, !112, i64 208} -!110 = !{!"_ZTSSt12_Ios_Iostate", !55, i64 0} -!111 = !{!"_ZTSNSt8ios_base6_WordsE", !58, i64 0, !59, i64 8} -!112 = !{!"_ZTSSt6locale", !58, i64 0} -!113 = !{!114} -!114 = distinct !{!114, !115, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_: %agg.result"} -!115 = distinct !{!115, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_"} -!116 = !{!117} -!117 = distinct !{!117, !118, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_: %agg.result"} -!118 = distinct !{!118, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_"} -!119 = !{!120} -!120 = distinct !{!120, !121, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_: %agg.result"} -!121 = distinct !{!121, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_"} -!122 = !{!123} -!123 = distinct !{!123, !124, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_: %agg.result"} -!124 = distinct !{!124, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_"} -!125 = !{!126} -!126 = distinct !{!126, !127, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_: %agg.result"} -!127 = distinct !{!127, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_"} -!128 = !{!129} -!129 = distinct !{!129, !130, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_: %agg.result"} -!130 = distinct !{!130, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_"} +!52 = !{!53, !55, i64 0} +!53 = !{!"_ZTSSt12_Vector_baseIfSaIfEE", !54, i64 0} +!54 = !{!"_ZTSNSt12_Vector_baseIfSaIfEE12_Vector_implE", !55, i64 0, !55, i64 8, !55, i64 16} +!55 = !{!"any pointer", !56, i64 0} +!56 = !{!"omnipotent char", !57, i64 0} +!57 = !{!"Simple C++ TBAA"} +!58 = !{!59, !55, i64 56} +!59 = !{!"_ZTS6Tensor", !60, i64 0, !60, i64 4, !60, i64 8, !61, i64 12, !55, i64 16, !55, i64 24, !55, i64 32, !55, i64 40, !55, i64 48, !55, i64 56, !55, i64 64, !62, i64 72, !62, i64 80, !63, i64 88} +!60 = !{!"int", !56, i64 0} +!61 = !{!"_ZTS15data_location_t", !56, i64 0} +!62 = !{!"long", !56, i64 0} +!63 = !{!"_ZTS9Dimension", !60, i64 0, !55, i64 8} +!64 = !{!59, !60, i64 88} +!65 = !{!59, !55, i64 96} +!66 = !{!62, !62, i64 0} +!67 = !{!59, !62, i64 72} +!68 = !{!59, !55, i64 48} +!69 = !{!59, !62, i64 80} +!70 = !{!59, !60, i64 0} +!71 = !{!72, !72, i64 0} +!72 = !{!"float", !56, i64 0} +!73 = distinct !{!73, !74} +!74 = !{!"llvm.loop.unroll.disable"} +!75 = distinct !{!75, !76, !77} +!76 = !{!"llvm.loop.vectorize.width", i32 1} +!77 = !{!"llvm.loop.interleave.count", i32 1} +!78 = distinct !{!78, !76, !77} +!79 = distinct !{!79, !74} +!80 = distinct !{!80, !76, !77} +!81 = distinct !{!81, !74} +!82 = distinct !{!82, !76, !77} +!83 = distinct !{!83, !76, !77} +!84 = distinct !{!84, !76, !77} +!85 = distinct !{!85, !74} +!86 = distinct !{!86, !76, !77} +!87 = distinct !{!87, !76, !77} +!88 = distinct !{!88, !74} +!89 = distinct !{!89, !76, !77} +!90 = distinct !{!90, !76, !77} +!91 = distinct !{!91, !76, !77} +!92 = distinct !{!92, !76, !77} +!93 = !{!56, !56, i64 0} +!94 = distinct !{!94, !76, !77} +!95 = distinct !{!95, !96, !76, !77} +!96 = !{!"llvm.loop.unroll.runtime.disable"} +!97 = !{!98, !98, i64 0} +!98 = !{!"vtable pointer", !57, i64 0} +!99 = !{!100, !55, i64 216} +!100 = !{!"_ZTSSt9basic_iosIcSt11char_traitsIcEE", !55, i64 216, !56, i64 224, !101, i64 225, !55, i64 232, !55, i64 240, !55, i64 248, !55, i64 256} +!101 = !{!"bool", !56, i64 0} +!102 = !{!100, !56, i64 224} +!103 = !{!100, !101, i64 225} +!104 = !{!105, !106, i64 64} +!105 = !{!"_ZTSNSt7__cxx1115basic_stringbufIcSt11char_traitsIcESaIcEEE", !106, i64 64, !107, i64 72} +!106 = !{!"_ZTSSt13_Ios_Openmode", !56, i64 0} +!107 = !{!"_ZTSNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE", !108, i64 0, !62, i64 8, !56, i64 16} +!108 = !{!"_ZTSNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_Alloc_hiderE", !55, i64 0} +!109 = !{!108, !55, i64 0} +!110 = !{!107, !62, i64 8} +!111 = !{!112, !112, i64 0} +!112 = !{!"_ZTSSt13_Ios_Fmtflags", !56, i64 0} +!113 = !{!107, !55, i64 0} +!114 = distinct !{!114, !76, !77} +!115 = distinct !{!115, !76, !77} +!116 = !{!117, !112, i64 24} +!117 = !{!"_ZTSSt8ios_base", !62, i64 8, !62, i64 16, !112, i64 24, !118, i64 28, !118, i64 32, !55, i64 40, !119, i64 48, !56, i64 64, !60, i64 192, !55, i64 200, !120, i64 208} +!118 = !{!"_ZTSSt12_Ios_Iostate", !56, i64 0} +!119 = !{!"_ZTSNSt8ios_base6_WordsE", !55, i64 0, !62, i64 8} +!120 = !{!"_ZTSSt6locale", !55, i64 0} +!121 = !{!60, !60, i64 0} +!122 = distinct !{!122, !76, !77} +!123 = distinct !{!123, !96, !76, !77} +!124 = !{!55, !55, i64 0} +!125 = !{!53, !55, i64 8} +!126 = !{!53, !55, i64 16} +!127 = distinct !{!127, !76, !77} +!128 = distinct !{!128, !76, !77} +!129 = distinct !{!129, !96, !76, !77} +!130 = distinct !{!130, !96, !76, !77} !131 = !{!132} !132 = distinct !{!132, !133, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_: %agg.result"} !133 = distinct !{!133, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_"} @@ -5614,67 +8802,85 @@ attributes #8 = { noreturn nounwind } !206 = !{!207} !207 = distinct !{!207, !208, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_: %agg.result"} !208 = distinct !{!208, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_"} -!209 = !{!210, !58, i64 0} -!210 = !{!"_ZTS6RootIn", !58, i64 0, !59, i64 8, !58, i64 16, !59, i64 24, !58, i64 32, !59, i64 40, !58, i64 48, !59, i64 56, !58, i64 64, !59, i64 72, !58, i64 80, !59, i64 88, !58, i64 96, !59, i64 104, !58, i64 112, !59, i64 120, !58, i64 128, !59, i64 136, !58, i64 144, !59, i64 152, !58, i64 160, !59, i64 168, !58, i64 176, !59, i64 184, !58, i64 192, !59, i64 200, !58, i64 208, !59, i64 216, !58, i64 224, !59, i64 232, !58, i64 240, !59, i64 248, !58, i64 256, !59, i64 264, !58, i64 272, !59, i64 280, !58, i64 288, !59, i64 296, !58, i64 304, !59, i64 312, !58, i64 320, !59, i64 328, !58, i64 336, !59, i64 344, !58, i64 352, !59, i64 360, !58, i64 368, !59, i64 376, !58, i64 384, !59, i64 392, !58, i64 400, !59, i64 408, !58, i64 416, !59, i64 424, !58, i64 432, !59, i64 440, !58, i64 448, !59, i64 456, !58, i64 464, !59, i64 472, !58, i64 480, !59, i64 488, !211, i64 496} -!211 = !{!"_ZTS5ret_t", !58, i64 0, !59, i64 8} -!212 = !{!210, !59, i64 8} -!213 = !{!210, !58, i64 16} -!214 = !{!210, !59, i64 24} -!215 = !{!210, !58, i64 32} -!216 = !{!210, !59, i64 40} -!217 = !{!210, !58, i64 48} -!218 = !{!210, !59, i64 56} -!219 = !{!210, !58, i64 64} -!220 = !{!210, !59, i64 72} -!221 = !{!210, !58, i64 80} -!222 = !{!210, !59, i64 88} -!223 = !{!210, !58, i64 96} -!224 = !{!210, !59, i64 104} -!225 = !{!210, !58, i64 112} -!226 = !{!210, !59, i64 120} -!227 = !{!210, !58, i64 128} -!228 = !{!210, !59, i64 136} -!229 = !{!210, !58, i64 144} -!230 = !{!210, !59, i64 152} -!231 = !{!210, !58, i64 160} -!232 = !{!210, !59, i64 168} -!233 = !{!210, !58, i64 176} -!234 = !{!210, !59, i64 184} -!235 = !{!210, !58, i64 192} -!236 = !{!210, !59, i64 200} -!237 = !{!210, !58, i64 208} -!238 = !{!210, !59, i64 216} -!239 = !{!210, !58, i64 224} -!240 = !{!210, !59, i64 232} -!241 = !{!210, !58, i64 240} -!242 = !{!210, !59, i64 248} -!243 = !{!210, !58, i64 256} -!244 = !{!210, !59, i64 264} -!245 = !{!210, !58, i64 272} -!246 = !{!210, !59, i64 280} -!247 = !{!210, !58, i64 288} -!248 = !{!210, !59, i64 296} -!249 = !{!210, !58, i64 304} -!250 = !{!210, !59, i64 312} -!251 = !{!210, !58, i64 320} -!252 = !{!210, !59, i64 328} -!253 = !{!210, !58, i64 336} -!254 = !{!210, !59, i64 344} -!255 = !{!210, !58, i64 352} -!256 = !{!210, !59, i64 360} -!257 = !{!210, !58, i64 368} -!258 = !{!210, !59, i64 376} -!259 = !{!210, !58, i64 384} -!260 = !{!210, !59, i64 392} -!261 = !{!210, !58, i64 400} -!262 = !{!210, !59, i64 408} -!263 = !{!210, !58, i64 416} -!264 = !{!210, !59, i64 424} -!265 = !{!210, !58, i64 432} -!266 = !{!210, !59, i64 440} -!267 = !{!210, !58, i64 448} -!268 = !{!210, !59, i64 456} -!269 = !{!210, !58, i64 464} -!270 = !{!210, !59, i64 472} -!271 = !{!210, !58, i64 480} -!272 = !{!210, !59, i64 488} +!209 = !{!210} +!210 = distinct !{!210, !211, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_: %agg.result"} +!211 = distinct !{!211, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_"} +!212 = !{!213} +!213 = distinct !{!213, !214, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_: %agg.result"} +!214 = distinct !{!214, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_"} +!215 = !{!216} +!216 = distinct !{!216, !217, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_: %agg.result"} +!217 = distinct !{!217, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_"} +!218 = !{!219} +!219 = distinct !{!219, !220, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_: %agg.result"} +!220 = distinct !{!220, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_"} +!221 = !{!222} +!222 = distinct !{!222, !223, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_: %agg.result"} +!223 = distinct !{!223, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_"} +!224 = !{!225} +!225 = distinct !{!225, !226, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_: %agg.result"} +!226 = distinct !{!226, !"_ZStplIcSt11char_traitsIcESaIcEENSt7__cxx1112basic_stringIT_T0_T1_EERKS8_OS8_"} +!227 = !{!228, !55, i64 0} +!228 = !{!"_ZTS6RootIn", !55, i64 0, !62, i64 8, !55, i64 16, !62, i64 24, !55, i64 32, !62, i64 40, !55, i64 48, !62, i64 56, !55, i64 64, !62, i64 72, !55, i64 80, !62, i64 88, !55, i64 96, !62, i64 104, !55, i64 112, !62, i64 120, !55, i64 128, !62, i64 136, !55, i64 144, !62, i64 152, !55, i64 160, !62, i64 168, !55, i64 176, !62, i64 184, !55, i64 192, !62, i64 200, !55, i64 208, !62, i64 216, !55, i64 224, !62, i64 232, !55, i64 240, !62, i64 248, !55, i64 256, !62, i64 264, !55, i64 272, !62, i64 280, !55, i64 288, !62, i64 296, !55, i64 304, !62, i64 312, !55, i64 320, !62, i64 328, !55, i64 336, !62, i64 344, !55, i64 352, !62, i64 360, !55, i64 368, !62, i64 376, !55, i64 384, !62, i64 392, !55, i64 400, !62, i64 408, !55, i64 416, !62, i64 424, !55, i64 432, !62, i64 440, !55, i64 448, !62, i64 456, !55, i64 464, !62, i64 472, !55, i64 480, !62, i64 488, !229, i64 496} +!229 = !{!"_ZTS5ret_t", !55, i64 0, !62, i64 8} +!230 = !{!228, !62, i64 8} +!231 = !{!228, !55, i64 16} +!232 = !{!228, !62, i64 24} +!233 = !{!228, !55, i64 32} +!234 = !{!228, !62, i64 40} +!235 = !{!228, !55, i64 48} +!236 = !{!228, !62, i64 56} +!237 = !{!228, !55, i64 64} +!238 = !{!228, !62, i64 72} +!239 = !{!228, !55, i64 80} +!240 = !{!228, !62, i64 88} +!241 = !{!228, !55, i64 96} +!242 = !{!228, !62, i64 104} +!243 = !{!228, !55, i64 112} +!244 = !{!228, !62, i64 120} +!245 = !{!228, !55, i64 128} +!246 = !{!228, !62, i64 136} +!247 = !{!228, !55, i64 144} +!248 = !{!228, !62, i64 152} +!249 = !{!228, !55, i64 160} +!250 = !{!228, !62, i64 168} +!251 = !{!228, !55, i64 176} +!252 = !{!228, !62, i64 184} +!253 = !{!228, !55, i64 192} +!254 = !{!228, !62, i64 200} +!255 = !{!228, !55, i64 208} +!256 = !{!228, !62, i64 216} +!257 = !{!228, !55, i64 224} +!258 = !{!228, !62, i64 232} +!259 = !{!228, !55, i64 240} +!260 = !{!228, !62, i64 248} +!261 = !{!228, !55, i64 256} +!262 = !{!228, !62, i64 264} +!263 = !{!228, !55, i64 272} +!264 = !{!228, !62, i64 280} +!265 = !{!228, !55, i64 288} +!266 = !{!228, !62, i64 296} +!267 = !{!228, !55, i64 304} +!268 = !{!228, !62, i64 312} +!269 = !{!228, !55, i64 320} +!270 = !{!228, !62, i64 328} +!271 = !{!228, !55, i64 336} +!272 = !{!228, !62, i64 344} +!273 = !{!228, !55, i64 352} +!274 = !{!228, !62, i64 360} +!275 = !{!228, !55, i64 368} +!276 = !{!228, !62, i64 376} +!277 = !{!228, !55, i64 384} +!278 = !{!228, !62, i64 392} +!279 = !{!228, !55, i64 400} +!280 = !{!228, !62, i64 408} +!281 = !{!228, !55, i64 416} +!282 = !{!228, !62, i64 424} +!283 = !{!228, !55, i64 432} +!284 = !{!228, !62, i64 440} +!285 = !{!228, !55, i64 448} +!286 = !{!228, !62, i64 456} +!287 = !{!228, !55, i64 464} +!288 = !{!228, !62, i64 472} +!289 = !{!228, !55, i64 480} +!290 = !{!228, !62, i64 488} diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/build/vgg16_cifar10_linked b/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/build/vgg16_cifar10_linked deleted file mode 100755 index 9822f3969c41ced6b162e48a1b90b34040582632..0000000000000000000000000000000000000000 Binary files a/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/build/vgg16_cifar10_linked and /dev/null differ diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/build/vgg16_cifar10_linked.bc b/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/build/vgg16_cifar10_linked.bc deleted file mode 100644 index 5509c97e970efee316d7083d6a574f7eaeeaf01a..0000000000000000000000000000000000000000 Binary files a/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/build/vgg16_cifar10_linked.bc and /dev/null differ diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/build/vgg16_cifar10_tune b/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/build/vgg16_cifar10_tune deleted file mode 100755 index e661fd0c167c169b6e2f204313bf6c750b741fb7..0000000000000000000000000000000000000000 Binary files a/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/build/vgg16_cifar10_tune and /dev/null differ diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/data/run_data/out-run-1 b/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/data/run_data/out-run-1 new file mode 100644 index 0000000000000000000000000000000000000000..449e2ceb26e4ee8f4873ee3fc1ec37d7e6196dfb --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/data/run_data/out-run-1 @@ -0,0 +1,257 @@ +size_in_bytes = 6912 +DEBUG: ***--- size_in_bytes = 6912 +DEBUG: Attempting to Allocate = 6912 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 27, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 256 +DEBUG: ***--- size_in_bytes = 256 +DEBUG: Attempting to Allocate = 256 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 64, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 147456 +DEBUG: ***--- size_in_bytes = 147456 +DEBUG: Attempting to Allocate = 147456 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 576, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 256 +DEBUG: ***--- size_in_bytes = 256 +DEBUG: Attempting to Allocate = 256 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 64, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 294912 +DEBUG: ***--- size_in_bytes = 294912 +DEBUG: Attempting to Allocate = 294912 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 576, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 512 +DEBUG: ***--- size_in_bytes = 512 +DEBUG: Attempting to Allocate = 512 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 128, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 589824 +DEBUG: ***--- size_in_bytes = 589824 +DEBUG: Attempting to Allocate = 589824 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 1152, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 512 +DEBUG: ***--- size_in_bytes = 512 +DEBUG: Attempting to Allocate = 512 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 128, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 1179648 +DEBUG: ***--- size_in_bytes = 1179648 +DEBUG: Attempting to Allocate = 1179648 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 1152, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 1024 +DEBUG: ***--- size_in_bytes = 1024 +DEBUG: Attempting to Allocate = 1024 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 256, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 2359296 +DEBUG: ***--- size_in_bytes = 2359296 +DEBUG: Attempting to Allocate = 2359296 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2304, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 1024 +DEBUG: ***--- size_in_bytes = 1024 +DEBUG: Attempting to Allocate = 1024 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 256, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 2359296 +DEBUG: ***--- size_in_bytes = 2359296 +DEBUG: Attempting to Allocate = 2359296 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2304, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 1024 +DEBUG: ***--- size_in_bytes = 1024 +DEBUG: Attempting to Allocate = 1024 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 256, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 4718592 +DEBUG: ***--- size_in_bytes = 4718592 +DEBUG: Attempting to Allocate = 4718592 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2304, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 2048 +DEBUG: ***--- size_in_bytes = 2048 +DEBUG: Attempting to Allocate = 2048 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 9437184 +DEBUG: ***--- size_in_bytes = 9437184 +DEBUG: Attempting to Allocate = 9437184 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4608, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 2048 +DEBUG: ***--- size_in_bytes = 2048 +DEBUG: Attempting to Allocate = 2048 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 9437184 +DEBUG: ***--- size_in_bytes = 9437184 +DEBUG: Attempting to Allocate = 9437184 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4608, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 2048 +DEBUG: ***--- size_in_bytes = 2048 +DEBUG: Attempting to Allocate = 2048 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 9437184 +DEBUG: ***--- size_in_bytes = 9437184 +DEBUG: Attempting to Allocate = 9437184 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4608, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 2048 +DEBUG: ***--- size_in_bytes = 2048 +DEBUG: Attempting to Allocate = 2048 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 9437184 +DEBUG: ***--- size_in_bytes = 9437184 +DEBUG: Attempting to Allocate = 9437184 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4608, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 2048 +DEBUG: ***--- size_in_bytes = 2048 +DEBUG: Attempting to Allocate = 2048 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 9437184 +DEBUG: ***--- size_in_bytes = 9437184 +DEBUG: Attempting to Allocate = 9437184 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4608, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 2048 +DEBUG: ***--- size_in_bytes = 2048 +DEBUG: Attempting to Allocate = 2048 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 1048576 +DEBUG: ***--- size_in_bytes = 1048576 +DEBUG: Attempting to Allocate = 1048576 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 262144, cStride = 262144, hStride = 512, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 2048 +DEBUG: ***--- size_in_bytes = 2048 +DEBUG: Attempting to Allocate = 2048 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 20480 +DEBUG: ***--- size_in_bytes = 20480 +DEBUG: Attempting to Allocate = 20480 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 5120, cStride = 5120, hStride = 10, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 40 +DEBUG: ***--- size_in_bytes = 40 +DEBUG: Attempting to Allocate = 40 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 10, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INITIALIZING GPU 0 +CREATED HANDLES 0 +INFO: +WARNING: File 'opentuner_flags' not found + + +initializing tuner .... +* LLVM_SRC_ROOT = /home/akashk4/merge/profiling/hpvm/llvm +- knobs_file_path = /home/akashk4/merge/profiling/hpvm/llvm/projects/hpvm-tensor-rt/autotuner/data/global_knobs.txt +*LLVM_SRC_ROOT = /home/akashk4/merge/profiling/hpvm/llvm- knobs_file_path = /home/akashk4/merge/profiling/hpvm/llvm/projects/hpvm-tensor-rt/autotuner/data/global_knobs.txt +Read PROMISE FLAGS 0 +DONE INTIALIZING GPU 0 +INFO: Reading Quantization Ranges File... +INFO: DONE. +INFO: Reading Configuration File... +DEBUG: first_line: +++++ +ERROR!: Please Add/Fix Baseline Time at Top of Config File.. \ No newline at end of file diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/predictive/vgg16_cifar10.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/predictive/vgg16_cifar10.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b325a9fe2d122e74cdd2b80e2768e68591313bf --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar10/predictive/vgg16_cifar10.txt @@ -0,0 +1,913 @@ +3776.508929999999 ++++++ +conf1 1 1 89.96 0.0 +1 gpu conv fp32 11 add fp32 1 relu fp32 1 +2 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +3 gpu conv fp32 11 add fp32 1 relu fp32 1 +4 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +5 gpu conv fp32 11 add fp32 1 relu fp32 1 +6 gpu conv fp32 11 add fp32 1 relu fp32 1 +7 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +8 gpu conv fp32 11 add fp32 1 relu fp32 1 +9 gpu conv fp32 11 add fp32 1 relu fp32 1 +10 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +11 gpu conv fp32 11 add fp32 1 relu fp32 1 +12 gpu conv fp32 11 add fp32 1 relu fp32 1 +13 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +14 gpu mul fp32 11 add fp32 1 relu fp32 1 +15 gpu mul fp32 11 add fp32 1 +16 gpu softmax fp32 1 +----- ++++++ +conf2 2.1225958306417145 1.9771056444390926 89.91 0.04999999999999716 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 167 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 164 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 267 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 161 add fp16 1 relu fp16 1 +12 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf3 2.090180991844805 1.9532689756636086 89.82 0.14000000000000057 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 167 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 269 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 164 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 161 add fp16 1 relu fp16 1 +12 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf4 2.169931036393396 2.0048851858669283 89.53999999999999 0.4200000000000017 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv perf_fp16 162 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 264 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 269 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf5 2.1012179398201756 1.9325098819632314 89.42 0.539999999999992 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 264 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +12 gpu conv fp16 11 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 269 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf6 2.2313002482945326 2.069581185407626 89.38000000000001 0.5799999999999841 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 158 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 269 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv fp16 12 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 269 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 264 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf7 2.143061101834193 1.9675759235961738 89.3 0.6599999999999966 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv fp16 12 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 265 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 264 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 269 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf8 2.199379444387758 2.0314348091429677 89.2 0.7599999999999909 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 264 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf9 2.3236298452294624 2.156907976575644 89.03999999999999 0.9200000000000017 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 159 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv fp16 11 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf10 2.3224369486241603 2.1560351277882046 89.03999999999999 0.9200000000000017 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 159 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv fp16 11 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf11 2.358467412507993 2.1904290636262784 89.02 0.9399999999999977 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 159 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 266 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 264 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf12 2.3633503986583126 2.1980949050120437 88.88000000000001 1.079999999999984 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 159 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 264 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf13 2.4903388172036043 2.3063593441573564 88.82 1.1400000000000006 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf14 2.508156996742662 2.3204109539869595 88.78 1.1799999999999926 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 156 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf15 2.4818531813049622 2.2910866330696744 88.75999999999999 1.2000000000000028 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 263 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf16 2.4591564896606 2.272664410995804 88.74 1.2199999999999989 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 263 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf17 2.5370582721089496 2.3464665753522405 88.72 1.2399999999999949 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf18 2.438100014978735 2.257620696759345 88.7 1.259999999999991 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 263 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf19 2.4776935382337006 2.2949598026093168 88.7 1.259999999999991 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf20 2.4380041604279596 2.254330054479329 88.68 1.279999999999987 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf21 2.4745444350223327 2.2883888475386525 88.64 1.3199999999999932 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf22 2.4136652022060625 2.2360545757445407 88.52 1.4399999999999977 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf23 2.510093966915115 2.316437144001897 88.52 1.4399999999999977 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 266 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf24 2.475990790728594 2.28127562431577 88.5 1.4599999999999937 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv fp16 11 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf25 2.4761929121466926 2.290365501363375 88.5 1.4599999999999937 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf26 2.4763575559033875 2.291312348847263 88.5 1.4599999999999937 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf27 2.600249602991055 2.4123747341424644 88.06 1.8999999999999915 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv perf_fp16 165 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf28 2.596077615026303 2.4115375655840245 88.02 1.9399999999999977 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv perf_fp16 166 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf29 2.580888020555937 2.3840829703999833 87.88 2.0799999999999983 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv samp_fp16 269 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf30 2.556352783745439 2.3641413704751537 87.8 2.1599999999999966 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv samp_fp16 269 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf31 2.5559756082494527 2.3677471703724575 87.78 2.1799999999999926 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 11 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf32 2.597413373332546 2.4091972878097585 87.76 2.1999999999999886 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv perf_fp16 164 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf33 2.4797467027434656 2.2874608793842612 87.74 2.219999999999999 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf34 2.593675604602072 2.400513932866452 87.7 2.259999999999991 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv perf_fp16 156 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 264 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf35 2.6300759173431336 2.432687374579977 87.62 2.339999999999989 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 266 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf36 2.5907083037103864 2.4042762580264356 87.6 2.3599999999999994 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv perf_fp16 156 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf37 2.6143261650366187 2.423427684623993 87.6 2.3599999999999994 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv perf_fp16 156 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf38 2.6144436259117203 2.4231961521843344 87.6 2.3599999999999994 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv perf_fp16 156 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf39 2.662088796913144 2.4660859696742032 87.6 2.3599999999999994 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf40 2.6210428708834517 2.423389791646294 87.58 2.3799999999999955 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv perf_fp16 156 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 265 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf41 2.6399924349243533 2.4443864221157914 87.58 2.3799999999999955 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv perf_fp16 156 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf42 2.616443708384916 2.4217582570150697 87.58 2.3799999999999955 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf43 2.6883473596205225 2.5036952786284137 87.5 2.4599999999999937 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv perf_fp16 166 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv perf_fp16 156 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf44 2.6117356623585875 2.420771216556161 87.48 2.4799999999999898 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv perf_fp16 156 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf45 2.6359174040106708 2.444231592562593 87.48 2.4799999999999898 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf46 2.56504192294198 2.371871906722655 87.44 2.519999999999996 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv fp16 11 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf47 2.5652588453899727 2.3816996471861174 87.44 2.519999999999996 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf48 2.68806951500876 2.5007647690311425 87.14 2.819999999999993 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv perf_fp16 166 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv perf_fp16 156 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar100/Makefile b/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar100/Makefile index ff73a6307184c945b5366c093168314088a13392..199d24b54f856be3b5c858ccd4ba8269a7e73328 100644 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar100/Makefile +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar100/Makefile @@ -22,7 +22,7 @@ TENSOR_AUTOTUNER_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/lib/libtensor_au CC_FLAGS = -I $(LLVM_INCLUDE_DIR) -I $(TENSOR_INCLUDE_DIR) -I $(TENSOR_RT_INCLUDE_DIR) -I $(CUDA_INCLUDE_PATH) -fno-exceptions -ffast-math -std=c++11 -O3 CCFLAGS += -DDEVICE=CUDNN_TARGET -LINKER_FLAGS = -lpthread -lcudart -lcurand -lcudnn -lcublas -lcufft -lOpenCL -lstdc++fs -lomp +LINKER_FLAGS = -lpthread -lcudart -lcurand -lcudnn -lcublas -lcufft -lOpenCL -lstdc++fs -lomp HPVM_LIB_DIR = $(HPVM_BUILD_DIR)/lib @@ -35,12 +35,16 @@ PROMISE_QUANT_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(A VISC_OPTFLAGS2 = -load $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LLVMInPlaceDFGAnalysis.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_PROMISE.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_CUDNN.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_X86.so -load $(HPVM_LIB_DIR)/LLVMFuseHPVMTensorNodes.so -load $(HPVM_LIB_DIR)/LLVMClearDFG.so -inplace -hpvm-fuse -dfg2llvm-promise -quantization-levels-filename=$(PROMISE_QUANT_FILE_PATH) -dfg2llvm-cudnn -dfg2llvm-x86 -clearDFG WRAPPER_API_QUANT_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/data/quant_ranges_rt.txt - -CONF_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/data/tuner_confs_base.txt +CONF_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/data/tuner_confs.txt VISC_OPTFLAGS3 = -load $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LLVMInPlaceDFGAnalysis.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_WrapperAPI.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_X86.so -load $(HPVM_LIB_DIR)/LLVMFuseHPVMTensorNodes.so -load $(HPVM_LIB_DIR)/LLVMClearDFG.so -inplace -hpvm-fuse -dfg2llvm-wrapperapi -quantization-levels-filename=$(WRAPPER_API_QUANT_FILE_PATH) -configuration-inputs-filename=$(CONF_FILE_PATH) -dfg2llvm-x86 -clearDFG +TEMP_CONF_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/predictive/temp.txt + + +VISC_PRED_OPTFLAGS3 = -load $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LLVMInPlaceDFGAnalysis.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_WrapperAPI.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_X86.so -load $(HPVM_LIB_DIR)/LLVMFuseHPVMTensorNodes.so -load $(HPVM_LIB_DIR)/LLVMClearDFG.so -inplace -hpvm-fuse -dfg2llvm-wrapperapi -quantization-levels-filename=$(WRAPPER_API_QUANT_FILE_PATH) -configuration-inputs-filename=$(TEMP_CONF_FILE_PATH) -dfg2llvm-x86 -clearDFG + TARGET = $(BUILD_DIR)/$(APP).opt.bc SOURCES = $(SRC_DIR)/$(APP).cpp VISC_RT_PATH = $(LLVM_SRC_ROOT)/../build/projects/visc-rt/visc-rt.ll @@ -64,14 +68,20 @@ $(BUILD_DIR)/%.opt.bc: $(BUILD_DIR)/%.ll #$(OPT) $(VISC_OPTFLAGS2) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_promise.bc $(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_wrapperapi.bc $(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_loop.visc.ll -o $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc + $(OPT) $(VISC_PRED_OPTFLAGS3) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_pred_wrapperapi.bc + $(OPT) $(VISC_PRED_OPTFLAGS3) $(BUILD_DIR)/$(APP)_loop.visc.ll -o $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi.bc $(LLVM_LINK) $(BUILD_DIR)/$(APP)_cudnn.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_cudnn_linked.bc #$(LLVM_LINK) $(BUILD_DIR)/$(APP)_promise.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_promise_linked.bc $(LLVM_LINK) $(BUILD_DIR)/$(APP)_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(LLVM_LINK) $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc + $(LLVM_LINK) $(BUILD_DIR)/$(APP)_pred_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_pred_wrapperapi_linked.bc + $(LLVM_LINK) $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi_linked.bc $(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_cudnn_linked $(LINKER_FLAGS) #$(CC) $(BUILD_DIR)/$(APP)_promise_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_promise_linked $(LINKER_FLAGS) $(CC) $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked $(LINKER_FLAGS) $(CC) $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked $(LINKER_FLAGS) + $(CC) $(BUILD_DIR)/$(APP)_pred_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_pred_wrapperapi_linked $(LINKER_FLAGS) + $(CC) $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi_linked $(LINKER_FLAGS) $(BUILD_DIR): mkdir -p $@ diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar100/data/run_data/out-run-1 b/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar100/data/run_data/out-run-1 new file mode 100644 index 0000000000000000000000000000000000000000..620f53ff3b149ce98a01d062fc03232255bf2715 --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar100/data/run_data/out-run-1 @@ -0,0 +1,92865 @@ +size_in_bytes = 6912 +DEBUG: ***--- size_in_bytes = 6912 +DEBUG: Attempting to Allocate = 6912 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 27, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 256 +DEBUG: ***--- size_in_bytes = 256 +DEBUG: Attempting to Allocate = 256 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 64, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 147456 +DEBUG: ***--- size_in_bytes = 147456 +DEBUG: Attempting to Allocate = 147456 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 576, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 256 +DEBUG: ***--- size_in_bytes = 256 +DEBUG: Attempting to Allocate = 256 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 64, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 294912 +DEBUG: ***--- size_in_bytes = 294912 +DEBUG: Attempting to Allocate = 294912 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 576, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 512 +DEBUG: ***--- size_in_bytes = 512 +DEBUG: Attempting to Allocate = 512 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 128, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 589824 +DEBUG: ***--- size_in_bytes = 589824 +DEBUG: Attempting to Allocate = 589824 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 1152, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 512 +DEBUG: ***--- size_in_bytes = 512 +DEBUG: Attempting to Allocate = 512 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 128, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 1179648 +DEBUG: ***--- size_in_bytes = 1179648 +DEBUG: Attempting to Allocate = 1179648 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 1152, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 1024 +DEBUG: ***--- size_in_bytes = 1024 +DEBUG: Attempting to Allocate = 1024 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 256, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 2359296 +DEBUG: ***--- size_in_bytes = 2359296 +DEBUG: Attempting to Allocate = 2359296 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2304, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 1024 +DEBUG: ***--- size_in_bytes = 1024 +DEBUG: Attempting to Allocate = 1024 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 256, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 2359296 +DEBUG: ***--- size_in_bytes = 2359296 +DEBUG: Attempting to Allocate = 2359296 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2304, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 1024 +DEBUG: ***--- size_in_bytes = 1024 +DEBUG: Attempting to Allocate = 1024 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 256, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 4718592 +DEBUG: ***--- size_in_bytes = 4718592 +DEBUG: Attempting to Allocate = 4718592 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2304, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 2048 +DEBUG: ***--- size_in_bytes = 2048 +DEBUG: Attempting to Allocate = 2048 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 9437184 +DEBUG: ***--- size_in_bytes = 9437184 +DEBUG: Attempting to Allocate = 9437184 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4608, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 2048 +DEBUG: ***--- size_in_bytes = 2048 +DEBUG: Attempting to Allocate = 2048 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 9437184 +DEBUG: ***--- size_in_bytes = 9437184 +DEBUG: Attempting to Allocate = 9437184 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4608, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 2048 +DEBUG: ***--- size_in_bytes = 2048 +DEBUG: Attempting to Allocate = 2048 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 9437184 +DEBUG: ***--- size_in_bytes = 9437184 +DEBUG: Attempting to Allocate = 9437184 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4608, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 2048 +DEBUG: ***--- size_in_bytes = 2048 +DEBUG: Attempting to Allocate = 2048 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 9437184 +DEBUG: ***--- size_in_bytes = 9437184 +DEBUG: Attempting to Allocate = 9437184 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4608, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 2048 +DEBUG: ***--- size_in_bytes = 2048 +DEBUG: Attempting to Allocate = 2048 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 9437184 +DEBUG: ***--- size_in_bytes = 9437184 +DEBUG: Attempting to Allocate = 9437184 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4608, cStride = 9, hStride = 3, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 2048 +DEBUG: ***--- size_in_bytes = 2048 +DEBUG: Attempting to Allocate = 2048 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 1048576 +DEBUG: ***--- size_in_bytes = 1048576 +DEBUG: Attempting to Allocate = 1048576 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 262144, cStride = 262144, hStride = 512, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 2048 +DEBUG: ***--- size_in_bytes = 2048 +DEBUG: Attempting to Allocate = 2048 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 204800 +DEBUG: ***--- size_in_bytes = 204800 +DEBUG: Attempting to Allocate = 204800 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 51200, cStride = 51200, hStride = 100, wStride = 1 +DEBUG: tensor->data_format = 0 +size_in_bytes = 400 +DEBUG: ***--- size_in_bytes = 400 +DEBUG: Attempting to Allocate = 400 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INITIALIZING GPU 0 +CREATED HANDLES 0 +INFO: +WARNING: File 'opentuner_flags' not found + + +initializing tuner .... +* LLVM_SRC_ROOT = /home/akashk4/merge/profiling/hpvm/llvm +- knobs_file_path = /home/akashk4/merge/profiling/hpvm/llvm/projects/hpvm-tensor-rt/autotuner/data/global_knobs.txt +*LLVM_SRC_ROOT = /home/akashk4/merge/profiling/hpvm/llvm- knobs_file_path = /home/akashk4/merge/profiling/hpvm/llvm/projects/hpvm-tensor-rt/autotuner/data/global_knobs.txt +Read PROMISE FLAGS 0 +DONE INTIALIZING GPU 0 +INFO: Reading Quantization Ranges File... +INFO: DONE. +INFO: Reading Configuration File... +DEBUG: first_line: 2000 +DEBUG: Baseline time: 2000.000000 + +DEBUG: line: +++++ +DEBUG: t: +++++ +DEBUG: +DEBUG: line: conf1 1 0 90.19 0 +DEBUG: t: conf1 +DEBUG: t: 1 +DEBUG: t: 0 +DEBUG: t: 90.19 +DEBUG: t: 0 +DEBUG: +DEBUG: line: 1 gpu conv fp32 1 add fp32 1 relu fp32 1 +DEBUG: t: 1 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 1 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 2 gpu conv fp32 1 add fp32 1 relu fp32 1 pool_max fp32 1 +DEBUG: t: 2 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: pool_max +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 4 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found pool_max operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 3 gpu conv fp32 1 add fp32 1 relu fp32 1 +DEBUG: t: 3 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 8 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 4 gpu conv fp32 1 add fp32 1 relu fp32 1 pool_max fp32 1 +DEBUG: t: 4 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: pool_max +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 11 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found pool_max operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 5 gpu conv fp32 1 add fp32 1 relu fp32 1 +DEBUG: t: 5 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 15 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 6 gpu conv fp32 1 add fp32 1 relu fp32 1 +DEBUG: t: 6 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 18 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 7 gpu conv fp32 1 add fp32 1 relu fp32 1 pool_max fp32 1 +DEBUG: t: 7 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: pool_max +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 21 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found pool_max operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 8 gpu conv fp32 1 add fp32 1 relu fp32 1 +DEBUG: t: 8 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 25 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 9 gpu conv fp32 1 add fp32 1 relu fp32 1 +DEBUG: t: 9 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 28 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 10 gpu conv fp32 1 add fp32 1 relu fp32 1 pool_max fp32 1 +DEBUG: t: 10 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: pool_max +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 31 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found pool_max operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 11 gpu conv fp32 1 add fp32 1 relu fp32 1 +DEBUG: t: 11 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 35 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 12 gpu conv fp32 1 add fp32 1 relu fp32 1 +DEBUG: t: 12 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 38 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 13 gpu conv fp32 1 add fp32 1 relu fp32 1 pool_max fp32 1 +DEBUG: t: 13 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: pool_max +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 41 + +DEBUG: Found conv operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found pool_max operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 14 gpu mul fp32 1 add fp32 1 relu fp32 1 +DEBUG: t: 14 +DEBUG: t: gpu +DEBUG: t: mul +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 45 + +DEBUG: Found mul operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 15 gpu mul fp32 1 add fp32 1 +DEBUG: t: 15 +DEBUG: t: gpu +DEBUG: t: mul +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 48 + +DEBUG: Found mul operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: 16 gpu softmax fp32 1 +DEBUG: t: 16 +DEBUG: t: gpu +DEBUG: t: softmax +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 50 + +DEBUG: Found softmax operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: ----- +DEBUG: t: ----- +DEBUG: +DEBUG: line: +++++ +DEBUG: t: +++++ +DEBUG: +DEBUG: line: conf2 1.5 0 90.19 0 +DEBUG: t: conf2 +DEBUG: t: 1.5 +DEBUG: t: 0 +DEBUG: t: 90.19 +DEBUG: t: 0 +DEBUG: +DEBUG: line: 1 gpu conv fp16 1 add fp16 1 relu fp16 1 +DEBUG: t: 1 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 1 + +DEBUG: Found conv operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 2 gpu conv fp16 1 add fp16 1 relu fp16 1 pool_max fp16 1 +DEBUG: t: 2 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: pool_max +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 4 + +DEBUG: Found conv operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found pool_max operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 3 gpu conv fp16 1 add fp16 1 relu fp16 1 +DEBUG: t: 3 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 8 + +DEBUG: Found conv operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 4 gpu conv fp16 1 add fp16 1 relu fp16 1 pool_max fp16 1 +DEBUG: t: 4 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: pool_max +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 11 + +DEBUG: Found conv operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found pool_max operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 5 gpu conv fp16 1 add fp16 1 relu fp16 1 +DEBUG: t: 5 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 15 + +DEBUG: Found conv operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 6 gpu conv fp16 1 add fp16 1 relu fp16 1 +DEBUG: t: 6 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 18 + +DEBUG: Found conv operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 7 gpu conv fp16 1 add fp16 1 relu fp16 1 pool_max fp16 1 +DEBUG: t: 7 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: pool_max +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 21 + +DEBUG: Found conv operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found pool_max operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 8 gpu conv fp16 1 add fp16 1 relu fp16 1 +DEBUG: t: 8 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 25 + +DEBUG: Found conv operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 9 gpu conv fp16 1 add fp16 1 relu fp16 1 +DEBUG: t: 9 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 28 + +DEBUG: Found conv operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 10 gpu conv fp16 1 add fp16 1 relu fp16 1 pool_max fp16 1 +DEBUG: t: 10 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: pool_max +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 31 + +DEBUG: Found conv operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found pool_max operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 11 gpu conv fp16 1 add fp16 1 relu fp16 1 +DEBUG: t: 11 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 35 + +DEBUG: Found conv operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 12 gpu conv fp16 1 add fp16 1 relu fp16 1 +DEBUG: t: 12 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 38 + +DEBUG: Found conv operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 13 gpu conv fp16 1 add fp16 1 relu fp16 1 pool_max fp16 1 +DEBUG: t: 13 +DEBUG: t: gpu +DEBUG: t: conv +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: pool_max +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 41 + +DEBUG: Found conv operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found pool_max operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 14 gpu mul fp16 1 add fp16 1 relu fp16 1 +DEBUG: t: 14 +DEBUG: t: gpu +DEBUG: t: mul +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: relu +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 45 + +DEBUG: Found mul operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found relu operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 15 gpu mul fp16 1 add fp16 1 +DEBUG: t: 15 +DEBUG: t: gpu +DEBUG: t: mul +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: t: add +DEBUG: t: fp16 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 48 + +DEBUG: Found mul operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: Found add operation +DEBUG: Found fp16 option +DEBUG: fp16 parameter: 1, ignoring +DEBUG: line: 16 gpu softmax fp32 1 +DEBUG: t: 16 +DEBUG: t: gpu +DEBUG: t: softmax +DEBUG: t: fp32 +DEBUG: t: 1 +DEBUG: +DEBUG: Found gpu configuration +DEBUG: *** firstTensorID = 50 + +DEBUG: Found softmax operation +DEBUG: Found fp32 option +DEBUG: fp32 parameter: 1, ignoring +DEBUG: line: ----- +DEBUG: t: ----- +DEBUG: +DEBUG: DONE. +INFO: Sorting autotuner configurations... +INFO: Done sorting. +DEBUG: start_idx = 1, end_idx = 2 +DEBUG: accuracy loss = 0.000000, speedup = 1.500000, at sp_idx = 1 +DEBUG: accuracy loss = 0.000000, energy = 0.000000, at en_idx = 1 +DEBUG: sp_notDominated = 1 +DEBUG: en_notDominated = 0 +INFO: Speedup Configurations ++++++ +conf1 1.000000 0.000000 90.190002 0.000000 +1 : gpu conv fp32 1 add fp32 1 relu fp32 1 +10 : gpu conv fp32 1 add fp32 1 relu fp32 1 pool_max fp32 1 +11 : gpu conv fp32 1 add fp32 1 relu fp32 1 +12 : gpu conv fp32 1 add fp32 1 relu fp32 1 +13 : gpu conv fp32 1 add fp32 1 relu fp32 1 pool_max fp32 1 +14 : gpu mul fp32 1 add fp32 1 relu fp32 1 +15 : gpu mul fp32 1 add fp32 1 +16 : gpu softmax fp32 1 +2 : gpu conv fp32 1 add fp32 1 relu fp32 1 pool_max fp32 1 +3 : gpu conv fp32 1 add fp32 1 relu fp32 1 +4 : gpu conv fp32 1 add fp32 1 relu fp32 1 pool_max fp32 1 +5 : gpu conv fp32 1 add fp32 1 relu fp32 1 +6 : gpu conv fp32 1 add fp32 1 relu fp32 1 +7 : gpu conv fp32 1 add fp32 1 relu fp32 1 pool_max fp32 1 +8 : gpu conv fp32 1 add fp32 1 relu fp32 1 +9 : gpu conv fp32 1 add fp32 1 relu fp32 1 +----- ++++++ +conf2 1.500000 0.000000 90.190002 0.000000 +1 : gpu conv fp16 1 add fp16 1 relu fp16 1 +10 : gpu conv fp16 1 add fp16 1 relu fp16 1 pool_max fp16 1 +11 : gpu conv fp16 1 add fp16 1 relu fp16 1 +12 : gpu conv fp16 1 add fp16 1 relu fp16 1 +13 : gpu conv fp16 1 add fp16 1 relu fp16 1 pool_max fp16 1 +14 : gpu mul fp16 1 add fp16 1 relu fp16 1 +15 : gpu mul fp16 1 add fp16 1 +16 : gpu softmax fp32 1 +2 : gpu conv fp16 1 add fp16 1 relu fp16 1 pool_max fp16 1 +3 : gpu conv fp16 1 add fp16 1 relu fp16 1 +4 : gpu conv fp16 1 add fp16 1 relu fp16 1 pool_max fp16 1 +5 : gpu conv fp16 1 add fp16 1 relu fp16 1 +6 : gpu conv fp16 1 add fp16 1 relu fp16 1 +7 : gpu conv fp16 1 add fp16 1 relu fp16 1 pool_max fp16 1 +8 : gpu conv fp16 1 add fp16 1 relu fp16 1 +9 : gpu conv fp16 1 add fp16 1 relu fp16 1 +----- +DEBUG: slowdowns file not found. Initializing slowdowns randomly. +*LLVM_SRC_ROOT = /home/akashk4/merge/profiling/hpvm/llvm- knobs_file_path = /home/akashk4/merge/profiling/hpvm/llvm/projects/hpvm-tensor-rt/autotuner/data/global_knobs.txt +* LLVM_SRC_ROOT = /home/akashk4/merge/profiling/hpvm/llvm +- knobs_file_path = /home/akashk4/merge/profiling/hpvm/llvm/projects/hpvm-tensor-rt/autotuner/data/global_knobs.txt +WARNING: pause_profiler was already called +Initializing policy object ... +DONE: Initializing policy object. +Select target device (0 for CPU, 1 fpr GPU): DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +INFO: Moving 6912 bytes from host to GPU +INFO: Moving 256 bytes from host to GPU +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.116993 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.117884 +INFO: TimeDuration, Event = Add_end, Time = 0.000891 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.117901 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.118728 +INFO: TimeDuration, Event = Relu_end, Time = 0.000827 +DEBUG: No data movement required - Data on Device +INFO: Moving 147456 bytes from host to GPU +INFO: Moving 256 bytes from host to GPU +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.135503 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.136374 +INFO: TimeDuration, Event = Add_end, Time = 0.000871 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.136512 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.137343 +INFO: TimeDuration, Event = Relu_end, Time = 0.000832 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352788.137357 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352788.140231 +INFO: TimeDuration, Event = Pool_end, Time = 0.002874 +DEBUG: No data movement required - Data on Device +INFO: Moving 294912 bytes from host to GPU +INFO: Moving 512 bytes from host to GPU +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.166328 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.166789 +INFO: TimeDuration, Event = Add_end, Time = 0.000461 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.166804 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.167230 +INFO: TimeDuration, Event = Relu_end, Time = 0.000426 +DEBUG: No data movement required - Data on Device +INFO: Moving 589824 bytes from host to GPU +INFO: Moving 512 bytes from host to GPU +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.190182 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.190640 +INFO: TimeDuration, Event = Add_end, Time = 0.000458 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.190655 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.191082 +INFO: TimeDuration, Event = Relu_end, Time = 0.000427 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352788.191094 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352788.197806 +INFO: TimeDuration, Event = Pool_end, Time = 0.006711 +DEBUG: No data movement required - Data on Device +INFO: Moving 1179648 bytes from host to GPU +INFO: Moving 1024 bytes from host to GPU +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.212230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.212530 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.212546 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.212769 +INFO: TimeDuration, Event = Relu_end, Time = 0.000222 +DEBUG: No data movement required - Data on Device +INFO: Moving 2359296 bytes from host to GPU +INFO: Moving 1024 bytes from host to GPU +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.224065 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.224373 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.224383 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.224604 +INFO: TimeDuration, Event = Relu_end, Time = 0.000221 +DEBUG: No data movement required - Data on Device +INFO: Moving 2359296 bytes from host to GPU +INFO: Moving 1024 bytes from host to GPU +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.240357 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.240662 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.240678 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.240900 +INFO: TimeDuration, Event = Relu_end, Time = 0.000222 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352788.240913 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352788.243645 +INFO: TimeDuration, Event = Pool_end, Time = 0.002732 +DEBUG: No data movement required - Data on Device +INFO: Moving 4718592 bytes from host to GPU +INFO: Moving 2048 bytes from host to GPU +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.253262 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.253621 +INFO: TimeDuration, Event = Add_end, Time = 0.000359 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.253633 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.253752 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +DEBUG: No data movement required - Data on Device +INFO: Moving 9437184 bytes from host to GPU +INFO: Moving 2048 bytes from host to GPU +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.264401 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.264763 +INFO: TimeDuration, Event = Add_end, Time = 0.000362 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.264781 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.264900 +INFO: TimeDuration, Event = Relu_end, Time = 0.000119 +DEBUG: No data movement required - Data on Device +INFO: Moving 9437184 bytes from host to GPU +INFO: Moving 2048 bytes from host to GPU +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.276915 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.277270 +INFO: TimeDuration, Event = Add_end, Time = 0.000355 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.277285 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.277405 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352788.277424 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352788.280160 +INFO: TimeDuration, Event = Pool_end, Time = 0.002736 +DEBUG: No data movement required - Data on Device +INFO: Moving 9437184 bytes from host to GPU +INFO: Moving 2048 bytes from host to GPU +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.286450 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.286774 +INFO: TimeDuration, Event = Add_end, Time = 0.000324 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.286800 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.286839 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +DEBUG: No data movement required - Data on Device +INFO: Moving 9437184 bytes from host to GPU +INFO: Moving 2048 bytes from host to GPU +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.292505 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.292829 +INFO: TimeDuration, Event = Add_end, Time = 0.000324 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.292841 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.292882 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +DEBUG: No data movement required - Data on Device +INFO: Moving 9437184 bytes from host to GPU +INFO: Moving 2048 bytes from host to GPU +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.298548 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.298866 +INFO: TimeDuration, Event = Add_end, Time = 0.000318 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.298880 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.298921 +INFO: TimeDuration, Event = Relu_end, Time = 0.000041 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352788.298936 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352788.299754 +INFO: TimeDuration, Event = Pool_end, Time = 0.000818 +DEBUG: No data movement required - Data on Device +INFO: Moving 1048576 bytes from host to GPU +INFO: Moving 2048 bytes from host to GPU +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352788.300267 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352788.300392 +INFO: TimeDuration, Event = Mul_end, Time = 0.000124 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.300404 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.300431 +INFO: TimeDuration, Event = Add_end, Time = 0.000027 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.300439 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.300459 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +INFO: Moving 204800 bytes from host to GPU +INFO: Moving 400 bytes from host to GPU +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352788.300582 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352788.300635 +INFO: TimeDuration, Event = Mul_end, Time = 0.000053 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.300647 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.300666 +INFO: TimeDuration, Event = Add_end, Time = 0.000018 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352788.300678 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352788.300774 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000096 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.364981, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.357100 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.357990 +INFO: TimeDuration, Event = Add_end, Time = 0.000889 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.358003 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.358838 +INFO: TimeDuration, Event = Relu_end, Time = 0.000834 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.374516 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.375398 +INFO: TimeDuration, Event = Add_end, Time = 0.000883 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.375413 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.376250 +INFO: TimeDuration, Event = Relu_end, Time = 0.000837 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352788.376266 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352788.379241 +INFO: TimeDuration, Event = Pool_end, Time = 0.002975 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.401088 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.401546 +INFO: TimeDuration, Event = Add_end, Time = 0.000459 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.401563 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.401991 +INFO: TimeDuration, Event = Relu_end, Time = 0.000428 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.423666 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.424129 +INFO: TimeDuration, Event = Add_end, Time = 0.000464 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.424145 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.424573 +INFO: TimeDuration, Event = Relu_end, Time = 0.000428 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352788.424586 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352788.431291 +INFO: TimeDuration, Event = Pool_end, Time = 0.006705 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.443791 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.444093 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.444106 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.444357 +INFO: TimeDuration, Event = Relu_end, Time = 0.000251 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.454220 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.454522 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.454534 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.454755 +INFO: TimeDuration, Event = Relu_end, Time = 0.000221 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.468805 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.469108 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.469121 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.469345 +INFO: TimeDuration, Event = Relu_end, Time = 0.000224 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352788.469359 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352788.472103 +INFO: TimeDuration, Event = Pool_end, Time = 0.002744 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.479914 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.480259 +INFO: TimeDuration, Event = Add_end, Time = 0.000345 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.480269 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.480390 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.487406 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.487750 +INFO: TimeDuration, Event = Add_end, Time = 0.000344 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.487763 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.487882 +INFO: TimeDuration, Event = Relu_end, Time = 0.000119 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.500739 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.501077 +INFO: TimeDuration, Event = Add_end, Time = 0.000339 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.501091 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.501215 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352788.501235 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352788.503975 +INFO: TimeDuration, Event = Pool_end, Time = 0.002741 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.506410 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.506697 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.506711 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.506753 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.509141 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.509436 +INFO: TimeDuration, Event = Add_end, Time = 0.000295 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.509450 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.509491 +INFO: TimeDuration, Event = Relu_end, Time = 0.000041 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.511860 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.512152 +INFO: TimeDuration, Event = Add_end, Time = 0.000291 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.512166 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.512207 +INFO: TimeDuration, Event = Relu_end, Time = 0.000041 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352788.512221 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352788.512870 +INFO: TimeDuration, Event = Pool_end, Time = 0.000649 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352788.512888 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352788.512983 +INFO: TimeDuration, Event = Mul_end, Time = 0.000096 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.512997 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.513019 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.513031 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.513050 +INFO: TimeDuration, Event = Relu_end, Time = 0.000019 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352788.513063 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352788.513113 +INFO: TimeDuration, Event = Mul_end, Time = 0.000049 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.513126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.513144 +INFO: TimeDuration, Event = Add_end, Time = 0.000018 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352788.513157 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352788.513293 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000136 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 66.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 159.218708, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.557569 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.558472 +INFO: TimeDuration, Event = Add_end, Time = 0.000904 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.558489 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.559312 +INFO: TimeDuration, Event = Relu_end, Time = 0.000823 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.574180 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.575077 +INFO: TimeDuration, Event = Add_end, Time = 0.000897 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.575108 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.575953 +INFO: TimeDuration, Event = Relu_end, Time = 0.000846 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352788.575974 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352788.578901 +INFO: TimeDuration, Event = Pool_end, Time = 0.002927 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.600750 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.601216 +INFO: TimeDuration, Event = Add_end, Time = 0.000466 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.601232 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.601664 +INFO: TimeDuration, Event = Relu_end, Time = 0.000432 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.619816 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.620286 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.620297 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.620733 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352788.620746 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352788.627440 +INFO: TimeDuration, Event = Pool_end, Time = 0.006695 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.638112 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.638415 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.638427 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.638650 +INFO: TimeDuration, Event = Relu_end, Time = 0.000223 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.647622 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.647922 +INFO: TimeDuration, Event = Add_end, Time = 0.000300 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.647934 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.648159 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.660620 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.660919 +INFO: TimeDuration, Event = Add_end, Time = 0.000299 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.660932 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.661155 +INFO: TimeDuration, Event = Relu_end, Time = 0.000223 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352788.661168 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352788.663911 +INFO: TimeDuration, Event = Pool_end, Time = 0.002743 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.671468 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.671795 +INFO: TimeDuration, Event = Add_end, Time = 0.000327 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.671809 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.671929 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.677849 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.678175 +INFO: TimeDuration, Event = Add_end, Time = 0.000326 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.678203 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.678327 +INFO: TimeDuration, Event = Relu_end, Time = 0.000124 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.686085 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.686412 +INFO: TimeDuration, Event = Add_end, Time = 0.000327 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.686427 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.686548 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352788.686568 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352788.689346 +INFO: TimeDuration, Event = Pool_end, Time = 0.002777 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.691486 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.691777 +INFO: TimeDuration, Event = Add_end, Time = 0.000290 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.691791 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.691832 +INFO: TimeDuration, Event = Relu_end, Time = 0.000041 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.694098 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.694382 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.694396 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.694437 +INFO: TimeDuration, Event = Relu_end, Time = 0.000041 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.696722 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.697006 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.697020 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.697060 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352788.697072 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352788.697684 +INFO: TimeDuration, Event = Pool_end, Time = 0.000611 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352788.697715 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352788.697803 +INFO: TimeDuration, Event = Mul_end, Time = 0.000089 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.697817 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.697836 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.697849 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.697866 +INFO: TimeDuration, Event = Relu_end, Time = 0.000018 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352788.697879 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352788.697923 +INFO: TimeDuration, Event = Mul_end, Time = 0.000044 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.697936 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.697953 +INFO: TimeDuration, Event = Add_end, Time = 0.000017 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352788.697966 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352788.698056 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000089 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 143.244434, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.737918 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.738820 +INFO: TimeDuration, Event = Add_end, Time = 0.000902 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.738833 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.739662 +INFO: TimeDuration, Event = Relu_end, Time = 0.000828 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.754026 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.754926 +INFO: TimeDuration, Event = Add_end, Time = 0.000900 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.754940 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.755789 +INFO: TimeDuration, Event = Relu_end, Time = 0.000850 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352788.755803 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352788.758754 +INFO: TimeDuration, Event = Pool_end, Time = 0.002951 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.780593 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.781063 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.781076 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.781511 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.797643 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.798118 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.798144 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.798578 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352788.798590 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352788.805274 +INFO: TimeDuration, Event = Pool_end, Time = 0.006685 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.815173 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.815469 +INFO: TimeDuration, Event = Add_end, Time = 0.000296 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.815483 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.815708 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.824267 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.824570 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.824585 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.824809 +INFO: TimeDuration, Event = Relu_end, Time = 0.000224 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.836705 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.837005 +INFO: TimeDuration, Event = Add_end, Time = 0.000299 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.837029 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.837254 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352788.837269 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352788.840000 +INFO: TimeDuration, Event = Pool_end, Time = 0.002731 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.846441 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.846776 +INFO: TimeDuration, Event = Add_end, Time = 0.000335 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.846787 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.846907 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.852802 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.853135 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.853148 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.853269 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.860805 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.861132 +INFO: TimeDuration, Event = Add_end, Time = 0.000327 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.861146 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.861266 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352788.861283 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352788.864045 +INFO: TimeDuration, Event = Pool_end, Time = 0.002763 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.866168 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.866451 +INFO: TimeDuration, Event = Add_end, Time = 0.000282 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.866462 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.866502 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.868749 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.869034 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.869059 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.869100 +INFO: TimeDuration, Event = Relu_end, Time = 0.000041 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.871328 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.871610 +INFO: TimeDuration, Event = Add_end, Time = 0.000281 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.871624 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.871663 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352788.871675 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352788.872290 +INFO: TimeDuration, Event = Pool_end, Time = 0.000615 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352788.872356 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352788.872446 +INFO: TimeDuration, Event = Mul_end, Time = 0.000090 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.872458 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.872478 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.872490 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.872508 +INFO: TimeDuration, Event = Relu_end, Time = 0.000018 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352788.872521 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352788.872566 +INFO: TimeDuration, Event = Mul_end, Time = 0.000045 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.872578 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.872595 +INFO: TimeDuration, Event = Add_end, Time = 0.000017 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352788.872607 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352788.872693 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000085 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 68.199997 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 137.441998, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.912919 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.913828 +INFO: TimeDuration, Event = Add_end, Time = 0.000909 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.913842 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.914668 +INFO: TimeDuration, Event = Relu_end, Time = 0.000827 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.929044 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.929952 +INFO: TimeDuration, Event = Add_end, Time = 0.000909 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.929967 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.930822 +INFO: TimeDuration, Event = Relu_end, Time = 0.000855 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352788.930838 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352788.933770 +INFO: TimeDuration, Event = Pool_end, Time = 0.002933 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.955609 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.956082 +INFO: TimeDuration, Event = Add_end, Time = 0.000474 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.956097 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.956537 +INFO: TimeDuration, Event = Relu_end, Time = 0.000440 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.974907 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.975386 +INFO: TimeDuration, Event = Add_end, Time = 0.000478 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.975398 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.975834 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352788.975846 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352788.982532 +INFO: TimeDuration, Event = Pool_end, Time = 0.006686 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352788.994235 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352788.994533 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352788.994547 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352788.994772 +INFO: TimeDuration, Event = Relu_end, Time = 0.000224 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.003331 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.003634 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.003645 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.003869 +INFO: TimeDuration, Event = Relu_end, Time = 0.000224 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.015764 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.016063 +INFO: TimeDuration, Event = Add_end, Time = 0.000299 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.016075 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.016298 +INFO: TimeDuration, Event = Relu_end, Time = 0.000223 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352789.016467 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352789.019056 +INFO: TimeDuration, Event = Pool_end, Time = 0.002589 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.025495 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.025828 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.025841 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.025960 +INFO: TimeDuration, Event = Relu_end, Time = 0.000119 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.033579 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.033912 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.033926 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.034047 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.041894 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.042224 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.042235 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.042355 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352789.042372 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352789.045135 +INFO: TimeDuration, Event = Pool_end, Time = 0.002763 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.047283 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.047565 +INFO: TimeDuration, Event = Add_end, Time = 0.000282 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.047576 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.047616 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.049896 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.050181 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.050194 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.050235 +INFO: TimeDuration, Event = Relu_end, Time = 0.000041 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.052482 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.052765 +INFO: TimeDuration, Event = Add_end, Time = 0.000283 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.052776 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.052816 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352789.052827 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352789.053442 +INFO: TimeDuration, Event = Pool_end, Time = 0.000616 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352789.053460 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352789.053550 +INFO: TimeDuration, Event = Mul_end, Time = 0.000090 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.053581 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.053605 +INFO: TimeDuration, Event = Add_end, Time = 0.000023 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.053617 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.053635 +INFO: TimeDuration, Event = Relu_end, Time = 0.000018 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352789.053648 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352789.053694 +INFO: TimeDuration, Event = Mul_end, Time = 0.000046 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.053705 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.053722 +INFO: TimeDuration, Event = Add_end, Time = 0.000017 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352789.053756 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352789.053858 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000101 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 143.595193, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.093618 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.094513 +INFO: TimeDuration, Event = Add_end, Time = 0.000896 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.094527 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.095350 +INFO: TimeDuration, Event = Relu_end, Time = 0.000823 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.109711 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.110611 +INFO: TimeDuration, Event = Add_end, Time = 0.000900 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.110626 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.111476 +INFO: TimeDuration, Event = Relu_end, Time = 0.000850 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352789.111488 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352789.114436 +INFO: TimeDuration, Event = Pool_end, Time = 0.002948 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.136279 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.136749 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.136764 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.137198 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.154126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.154603 +INFO: TimeDuration, Event = Add_end, Time = 0.000477 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.154628 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.155062 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352789.155075 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352789.161752 +INFO: TimeDuration, Event = Pool_end, Time = 0.006677 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.171645 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.171940 +INFO: TimeDuration, Event = Add_end, Time = 0.000295 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.171952 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.172177 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.180745 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.181047 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.181072 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.181297 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.194430 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.194733 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.194747 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.194972 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352789.194984 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352789.197722 +INFO: TimeDuration, Event = Pool_end, Time = 0.002738 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.204155 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.204490 +INFO: TimeDuration, Event = Add_end, Time = 0.000335 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.204503 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.204623 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.210755 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.211082 +INFO: TimeDuration, Event = Add_end, Time = 0.000327 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.211094 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.211214 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.219069 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.219396 +INFO: TimeDuration, Event = Add_end, Time = 0.000327 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.219407 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.219528 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352789.219545 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352789.222308 +INFO: TimeDuration, Event = Pool_end, Time = 0.002762 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.224451 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.224737 +INFO: TimeDuration, Event = Add_end, Time = 0.000286 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.224749 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.224788 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.227022 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.227308 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.227321 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.227361 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.229606 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.229890 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.229902 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.229943 +INFO: TimeDuration, Event = Relu_end, Time = 0.000041 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352789.229955 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352789.230563 +INFO: TimeDuration, Event = Pool_end, Time = 0.000608 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352789.230580 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352789.230667 +INFO: TimeDuration, Event = Mul_end, Time = 0.000087 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.230679 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.230698 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.230710 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.230728 +INFO: TimeDuration, Event = Relu_end, Time = 0.000018 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352789.230742 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352789.230787 +INFO: TimeDuration, Event = Mul_end, Time = 0.000045 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.230800 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.230817 +INFO: TimeDuration, Event = Add_end, Time = 0.000018 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352789.230830 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352789.230914 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000084 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 140.225374, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.271015 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.271916 +INFO: TimeDuration, Event = Add_end, Time = 0.000901 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.271929 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.272755 +INFO: TimeDuration, Event = Relu_end, Time = 0.000826 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.287106 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.288008 +INFO: TimeDuration, Event = Add_end, Time = 0.000902 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.288023 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.288872 +INFO: TimeDuration, Event = Relu_end, Time = 0.000849 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352789.288884 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352789.291830 +INFO: TimeDuration, Event = Pool_end, Time = 0.002946 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.313670 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.314139 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.314155 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.314588 +INFO: TimeDuration, Event = Relu_end, Time = 0.000433 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.330713 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.331187 +INFO: TimeDuration, Event = Add_end, Time = 0.000474 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.331200 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.331636 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352789.331649 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352789.338340 +INFO: TimeDuration, Event = Pool_end, Time = 0.006691 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.348232 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.348528 +INFO: TimeDuration, Event = Add_end, Time = 0.000296 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.348542 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.348769 +INFO: TimeDuration, Event = Relu_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.357340 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.357643 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.357656 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.357881 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.369771 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.370070 +INFO: TimeDuration, Event = Add_end, Time = 0.000300 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.370082 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.370307 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352789.370319 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352789.373064 +INFO: TimeDuration, Event = Pool_end, Time = 0.002745 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.379502 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.379833 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.379845 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.379968 +INFO: TimeDuration, Event = Relu_end, Time = 0.000124 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.385799 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.386127 +INFO: TimeDuration, Event = Add_end, Time = 0.000329 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.386140 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.386262 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.393815 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.394143 +INFO: TimeDuration, Event = Add_end, Time = 0.000327 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.394176 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.394295 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352789.394312 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352789.397054 +INFO: TimeDuration, Event = Pool_end, Time = 0.002742 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.399199 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.399480 +INFO: TimeDuration, Event = Add_end, Time = 0.000282 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.399493 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.399533 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.401785 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.402068 +INFO: TimeDuration, Event = Add_end, Time = 0.000283 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.402080 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.402120 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.404442 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.404719 +INFO: TimeDuration, Event = Add_end, Time = 0.000278 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.404733 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.404774 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352789.404786 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352789.405345 +INFO: TimeDuration, Event = Pool_end, Time = 0.000559 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352789.405365 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352789.405452 +INFO: TimeDuration, Event = Mul_end, Time = 0.000087 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.405465 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.405485 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.405496 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.405513 +INFO: TimeDuration, Event = Relu_end, Time = 0.000018 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352789.405539 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352789.405584 +INFO: TimeDuration, Event = Mul_end, Time = 0.000045 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.405596 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.405614 +INFO: TimeDuration, Event = Add_end, Time = 0.000017 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352789.405626 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352789.405709 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000082 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 137.485106, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.445486 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.446386 +INFO: TimeDuration, Event = Add_end, Time = 0.000899 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.446399 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.447231 +INFO: TimeDuration, Event = Relu_end, Time = 0.000832 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.461594 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.462493 +INFO: TimeDuration, Event = Add_end, Time = 0.000899 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.462510 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.463368 +INFO: TimeDuration, Event = Relu_end, Time = 0.000858 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352789.463380 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352789.466322 +INFO: TimeDuration, Event = Pool_end, Time = 0.002943 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.488167 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.488638 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.488655 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.489089 +INFO: TimeDuration, Event = Relu_end, Time = 0.000433 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.505199 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.505674 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.505686 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.506119 +INFO: TimeDuration, Event = Relu_end, Time = 0.000433 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352789.506131 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352789.512826 +INFO: TimeDuration, Event = Pool_end, Time = 0.006695 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.522730 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.523026 +INFO: TimeDuration, Event = Add_end, Time = 0.000296 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.523039 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.523266 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.531824 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.532126 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.532138 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.532363 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.544252 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.544551 +INFO: TimeDuration, Event = Add_end, Time = 0.000300 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.544565 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.544788 +INFO: TimeDuration, Event = Relu_end, Time = 0.000224 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352789.544801 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352789.547545 +INFO: TimeDuration, Event = Pool_end, Time = 0.002745 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.553977 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.554308 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.554321 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.554441 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.560269 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.560596 +INFO: TimeDuration, Event = Add_end, Time = 0.000327 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.560610 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.560730 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.568286 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.568613 +INFO: TimeDuration, Event = Add_end, Time = 0.000326 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.568626 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.568747 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352789.568763 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352789.571515 +INFO: TimeDuration, Event = Pool_end, Time = 0.002752 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.573657 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.573941 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.573952 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.573993 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.576239 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.576522 +INFO: TimeDuration, Event = Add_end, Time = 0.000283 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.576534 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.576574 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.578829 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.579109 +INFO: TimeDuration, Event = Add_end, Time = 0.000280 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.579122 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.579162 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352789.579174 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352789.579791 +INFO: TimeDuration, Event = Pool_end, Time = 0.000617 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352789.579808 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352789.579895 +INFO: TimeDuration, Event = Mul_end, Time = 0.000088 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.579909 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.579929 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.579940 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.579958 +INFO: TimeDuration, Event = Relu_end, Time = 0.000018 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352789.579970 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352789.580014 +INFO: TimeDuration, Event = Mul_end, Time = 0.000045 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.580025 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.580065 +INFO: TimeDuration, Event = Add_end, Time = 0.000040 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352789.580079 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352789.580163 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000084 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 66.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 137.420071, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.620030 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.620929 +INFO: TimeDuration, Event = Add_end, Time = 0.000899 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.620944 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.621773 +INFO: TimeDuration, Event = Relu_end, Time = 0.000828 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.636129 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.637032 +INFO: TimeDuration, Event = Add_end, Time = 0.000903 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.637048 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.637898 +INFO: TimeDuration, Event = Relu_end, Time = 0.000850 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352789.637911 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352789.640860 +INFO: TimeDuration, Event = Pool_end, Time = 0.002948 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.662693 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.663160 +INFO: TimeDuration, Event = Add_end, Time = 0.000467 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.663176 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.663612 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.679753 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.680233 +INFO: TimeDuration, Event = Add_end, Time = 0.000480 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.680245 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.680680 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352789.680694 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352789.689482 +INFO: TimeDuration, Event = Pool_end, Time = 0.008789 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.697500 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.697795 +INFO: TimeDuration, Event = Add_end, Time = 0.000295 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.697809 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.698034 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.706599 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.706903 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.706914 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.707142 +INFO: TimeDuration, Event = Relu_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.719019 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.719321 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.719333 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.719557 +INFO: TimeDuration, Event = Relu_end, Time = 0.000224 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352789.719569 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352789.722314 +INFO: TimeDuration, Event = Pool_end, Time = 0.002745 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.728750 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.729082 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.729095 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.729216 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.735370 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.735696 +INFO: TimeDuration, Event = Add_end, Time = 0.000325 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.735709 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.735829 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.743367 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.743694 +INFO: TimeDuration, Event = Add_end, Time = 0.000327 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.743707 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.743827 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352789.743842 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352789.746608 +INFO: TimeDuration, Event = Pool_end, Time = 0.002766 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.748727 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.749013 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.749026 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.749065 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.751313 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.751595 +INFO: TimeDuration, Event = Add_end, Time = 0.000283 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.751607 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.751647 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.753894 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.754176 +INFO: TimeDuration, Event = Add_end, Time = 0.000282 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.754189 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.754229 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352789.754240 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352789.754856 +INFO: TimeDuration, Event = Pool_end, Time = 0.000616 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352789.754873 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352789.754961 +INFO: TimeDuration, Event = Mul_end, Time = 0.000088 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.754974 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.754993 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.755004 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.755022 +INFO: TimeDuration, Event = Relu_end, Time = 0.000018 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352789.755034 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352789.755079 +INFO: TimeDuration, Event = Mul_end, Time = 0.000045 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.755090 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.755108 +INFO: TimeDuration, Event = Add_end, Time = 0.000018 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352789.755121 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352789.755204 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000083 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 137.900528, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.795187 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.796086 +INFO: TimeDuration, Event = Add_end, Time = 0.000899 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.796100 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.796931 +INFO: TimeDuration, Event = Relu_end, Time = 0.000830 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.811298 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.812198 +INFO: TimeDuration, Event = Add_end, Time = 0.000901 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.812215 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.813069 +INFO: TimeDuration, Event = Relu_end, Time = 0.000854 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352789.813084 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352789.816007 +INFO: TimeDuration, Event = Pool_end, Time = 0.002923 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.837846 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.838315 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.838330 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.838763 +INFO: TimeDuration, Event = Relu_end, Time = 0.000433 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.856543 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.857021 +INFO: TimeDuration, Event = Add_end, Time = 0.000478 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.857046 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.857481 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352789.857493 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352789.864161 +INFO: TimeDuration, Event = Pool_end, Time = 0.006668 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.874073 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.874369 +INFO: TimeDuration, Event = Add_end, Time = 0.000296 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.874381 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.874606 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.883181 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.883483 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.883496 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.883720 +INFO: TimeDuration, Event = Relu_end, Time = 0.000224 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.896654 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.896957 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.896973 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.897200 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352789.897213 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352789.899939 +INFO: TimeDuration, Event = Pool_end, Time = 0.002725 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.906380 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.906713 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.906724 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.906844 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.912671 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.913002 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.913015 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.913135 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.920981 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.921309 +INFO: TimeDuration, Event = Add_end, Time = 0.000327 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.921322 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.921442 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352789.921459 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352789.924225 +INFO: TimeDuration, Event = Pool_end, Time = 0.002765 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.926363 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.926643 +INFO: TimeDuration, Event = Add_end, Time = 0.000280 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.926655 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.926695 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.928934 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.929217 +INFO: TimeDuration, Event = Add_end, Time = 0.000283 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.929231 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.929271 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.931523 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.931808 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.931821 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.931860 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352789.931873 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352789.932482 +INFO: TimeDuration, Event = Pool_end, Time = 0.000609 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352789.932498 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352789.932585 +INFO: TimeDuration, Event = Mul_end, Time = 0.000087 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.932597 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.932616 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.932628 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.932647 +INFO: TimeDuration, Event = Relu_end, Time = 0.000018 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352789.932659 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352789.932703 +INFO: TimeDuration, Event = Mul_end, Time = 0.000045 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.932717 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.932734 +INFO: TimeDuration, Event = Add_end, Time = 0.000018 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352789.932748 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352789.932829 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000081 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 140.375419, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.972573 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.973475 +INFO: TimeDuration, Event = Add_end, Time = 0.000902 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.973490 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.974317 +INFO: TimeDuration, Event = Relu_end, Time = 0.000827 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352789.988672 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352789.989573 +INFO: TimeDuration, Event = Add_end, Time = 0.000902 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352789.989588 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352789.990440 +INFO: TimeDuration, Event = Relu_end, Time = 0.000852 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352789.990453 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352789.993407 +INFO: TimeDuration, Event = Pool_end, Time = 0.002954 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.015249 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.015718 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.015734 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.016168 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.032294 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.032773 +INFO: TimeDuration, Event = Add_end, Time = 0.000479 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.032786 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.033220 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352790.033231 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352790.039919 +INFO: TimeDuration, Event = Pool_end, Time = 0.006688 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.049823 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.050118 +INFO: TimeDuration, Event = Add_end, Time = 0.000295 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.050130 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.050355 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.058931 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.059234 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.059247 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.059473 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.071359 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.071659 +INFO: TimeDuration, Event = Add_end, Time = 0.000299 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.071671 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.071895 +INFO: TimeDuration, Event = Relu_end, Time = 0.000224 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352790.071908 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352790.074655 +INFO: TimeDuration, Event = Pool_end, Time = 0.002747 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.081089 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.081422 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.081434 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.081555 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.087691 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.088021 +INFO: TimeDuration, Event = Add_end, Time = 0.000329 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.088032 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.088152 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.096009 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.096332 +INFO: TimeDuration, Event = Add_end, Time = 0.000323 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.096433 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.096551 +INFO: TimeDuration, Event = Relu_end, Time = 0.000119 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352790.096568 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352790.099251 +INFO: TimeDuration, Event = Pool_end, Time = 0.002683 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.101373 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.101655 +INFO: TimeDuration, Event = Add_end, Time = 0.000281 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.101666 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.101706 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.103939 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.104223 +INFO: TimeDuration, Event = Add_end, Time = 0.000283 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.104235 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.104276 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.106534 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.106817 +INFO: TimeDuration, Event = Add_end, Time = 0.000282 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.106827 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.106867 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352790.106879 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352790.107494 +INFO: TimeDuration, Event = Pool_end, Time = 0.000614 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352790.107510 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352790.107603 +INFO: TimeDuration, Event = Mul_end, Time = 0.000092 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.107615 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.107635 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.107646 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.107664 +INFO: TimeDuration, Event = Relu_end, Time = 0.000018 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352790.107677 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352790.107726 +INFO: TimeDuration, Event = Mul_end, Time = 0.000049 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.107738 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.107755 +INFO: TimeDuration, Event = Add_end, Time = 0.000017 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352790.107769 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352790.107886 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000117 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 137.951829, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.147506 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.148409 +INFO: TimeDuration, Event = Add_end, Time = 0.000904 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.148471 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.149288 +INFO: TimeDuration, Event = Relu_end, Time = 0.000818 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.163651 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.164556 +INFO: TimeDuration, Event = Add_end, Time = 0.000905 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.164571 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.165423 +INFO: TimeDuration, Event = Relu_end, Time = 0.000852 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352790.165435 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352790.168377 +INFO: TimeDuration, Event = Pool_end, Time = 0.002942 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.190215 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.190686 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.190701 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.191138 +INFO: TimeDuration, Event = Relu_end, Time = 0.000437 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.207252 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.207726 +INFO: TimeDuration, Event = Add_end, Time = 0.000474 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.207739 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.208173 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352790.208184 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352790.214879 +INFO: TimeDuration, Event = Pool_end, Time = 0.006694 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.224787 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.225083 +INFO: TimeDuration, Event = Add_end, Time = 0.000296 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.225096 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.225320 +INFO: TimeDuration, Event = Relu_end, Time = 0.000224 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.233894 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.234197 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.234210 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.234434 +INFO: TimeDuration, Event = Relu_end, Time = 0.000224 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.246323 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.246621 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.246636 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.246860 +INFO: TimeDuration, Event = Relu_end, Time = 0.000224 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352790.246872 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352790.249617 +INFO: TimeDuration, Event = Pool_end, Time = 0.002745 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.256060 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.256395 +INFO: TimeDuration, Event = Add_end, Time = 0.000335 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.256432 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.256552 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.262668 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.262999 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.263010 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.263131 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.270983 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.271307 +INFO: TimeDuration, Event = Add_end, Time = 0.000325 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.271321 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.271440 +INFO: TimeDuration, Event = Relu_end, Time = 0.000119 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352790.271471 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352790.274224 +INFO: TimeDuration, Event = Pool_end, Time = 0.002753 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.276440 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.276722 +INFO: TimeDuration, Event = Add_end, Time = 0.000282 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.276735 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.276775 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.278989 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.279275 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.279288 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.279327 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.281577 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.281859 +INFO: TimeDuration, Event = Add_end, Time = 0.000282 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.281872 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.281912 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352790.281924 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352790.282539 +INFO: TimeDuration, Event = Pool_end, Time = 0.000615 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352790.282555 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352790.282642 +INFO: TimeDuration, Event = Mul_end, Time = 0.000087 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.282655 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.282674 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.282686 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.282704 +INFO: TimeDuration, Event = Relu_end, Time = 0.000018 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352790.282716 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352790.282789 +INFO: TimeDuration, Event = Mul_end, Time = 0.000073 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.282803 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.282821 +INFO: TimeDuration, Event = Add_end, Time = 0.000018 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352790.282835 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352790.282961 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000126 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 66.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 138.044949, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.322747 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.323653 +INFO: TimeDuration, Event = Add_end, Time = 0.000907 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.323668 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.324495 +INFO: TimeDuration, Event = Relu_end, Time = 0.000827 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.338852 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.339751 +INFO: TimeDuration, Event = Add_end, Time = 0.000898 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.339765 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.340616 +INFO: TimeDuration, Event = Relu_end, Time = 0.000851 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352790.340706 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352790.343579 +INFO: TimeDuration, Event = Pool_end, Time = 0.002873 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.365415 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.365883 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.365899 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.366332 +INFO: TimeDuration, Event = Relu_end, Time = 0.000433 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.382462 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.382937 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.382950 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.383384 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352790.383396 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352790.390090 +INFO: TimeDuration, Event = Pool_end, Time = 0.006695 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.399999 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.400294 +INFO: TimeDuration, Event = Add_end, Time = 0.000295 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.400313 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.400537 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.409097 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.409400 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.409412 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.409635 +INFO: TimeDuration, Event = Relu_end, Time = 0.000224 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.421526 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.421826 +INFO: TimeDuration, Event = Add_end, Time = 0.000300 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.421838 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.422062 +INFO: TimeDuration, Event = Relu_end, Time = 0.000224 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352790.422088 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352790.424824 +INFO: TimeDuration, Event = Pool_end, Time = 0.002737 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.431253 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.431585 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.431597 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.431717 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.437544 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.437874 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.437885 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.438005 +INFO: TimeDuration, Event = Relu_end, Time = 0.000119 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.445554 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.445881 +INFO: TimeDuration, Event = Add_end, Time = 0.000326 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.445894 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.446013 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352790.446029 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352790.448793 +INFO: TimeDuration, Event = Pool_end, Time = 0.002764 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.450914 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.451195 +INFO: TimeDuration, Event = Add_end, Time = 0.000282 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.451208 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.451248 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.453497 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.453779 +INFO: TimeDuration, Event = Add_end, Time = 0.000282 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.453792 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.453832 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.456100 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.456383 +INFO: TimeDuration, Event = Add_end, Time = 0.000283 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.456393 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.456433 +INFO: TimeDuration, Event = Relu_end, Time = 0.000041 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352790.456445 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352790.457061 +INFO: TimeDuration, Event = Pool_end, Time = 0.000616 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352790.457077 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352790.457164 +INFO: TimeDuration, Event = Mul_end, Time = 0.000087 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.457178 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.457197 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.457209 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.457227 +INFO: TimeDuration, Event = Relu_end, Time = 0.000018 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352790.457240 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352790.457285 +INFO: TimeDuration, Event = Mul_end, Time = 0.000045 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.457299 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.457316 +INFO: TimeDuration, Event = Add_end, Time = 0.000018 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352790.457330 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352790.457411 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000081 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 137.450833, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.496853 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.497761 +INFO: TimeDuration, Event = Add_end, Time = 0.000908 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.497775 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.498602 +INFO: TimeDuration, Event = Relu_end, Time = 0.000827 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.512957 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.513857 +INFO: TimeDuration, Event = Add_end, Time = 0.000900 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.513872 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.514724 +INFO: TimeDuration, Event = Relu_end, Time = 0.000852 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352790.514737 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352790.517683 +INFO: TimeDuration, Event = Pool_end, Time = 0.002945 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.539520 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.539990 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.540004 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.540440 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.556566 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.557041 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.557054 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.557487 +INFO: TimeDuration, Event = Relu_end, Time = 0.000433 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352790.557518 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352790.564192 +INFO: TimeDuration, Event = Pool_end, Time = 0.006674 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.574093 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.574389 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.574401 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.574628 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.583194 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.583496 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.583508 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.583733 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.595633 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.595932 +INFO: TimeDuration, Event = Add_end, Time = 0.000300 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.595945 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.596169 +INFO: TimeDuration, Event = Relu_end, Time = 0.000224 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352790.596180 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352790.598925 +INFO: TimeDuration, Event = Pool_end, Time = 0.002744 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.605357 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.605690 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.605702 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.605824 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.611958 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.612288 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.612301 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.612428 +INFO: TimeDuration, Event = Relu_end, Time = 0.000127 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.620274 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.620603 +INFO: TimeDuration, Event = Add_end, Time = 0.000329 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.620617 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.620739 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352790.620755 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352790.623515 +INFO: TimeDuration, Event = Pool_end, Time = 0.002760 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.625630 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.625917 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.625930 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.625970 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.628216 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.628500 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.628513 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.628553 +INFO: TimeDuration, Event = Relu_end, Time = 0.000041 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.630795 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.631077 +INFO: TimeDuration, Event = Add_end, Time = 0.000282 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.631089 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.631129 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352790.631142 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352790.631755 +INFO: TimeDuration, Event = Pool_end, Time = 0.000613 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352790.631771 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352790.631860 +INFO: TimeDuration, Event = Mul_end, Time = 0.000088 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.631872 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.631892 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.631903 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.631921 +INFO: TimeDuration, Event = Relu_end, Time = 0.000018 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352790.631933 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352790.631977 +INFO: TimeDuration, Event = Mul_end, Time = 0.000044 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.631989 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.632006 +INFO: TimeDuration, Event = Add_end, Time = 0.000018 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352790.632018 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352790.632097 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000079 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 68.199997 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 137.986113, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.671586 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.672488 +INFO: TimeDuration, Event = Add_end, Time = 0.000902 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.672502 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.673328 +INFO: TimeDuration, Event = Relu_end, Time = 0.000826 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.687679 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.688580 +INFO: TimeDuration, Event = Add_end, Time = 0.000901 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.688593 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.689444 +INFO: TimeDuration, Event = Relu_end, Time = 0.000851 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352790.689458 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352790.692406 +INFO: TimeDuration, Event = Pool_end, Time = 0.002948 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.714244 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.714720 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.714735 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.715168 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.731297 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.731771 +INFO: TimeDuration, Event = Add_end, Time = 0.000473 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.731782 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.732214 +INFO: TimeDuration, Event = Relu_end, Time = 0.000432 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352790.732226 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352790.738925 +INFO: TimeDuration, Event = Pool_end, Time = 0.006699 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.748828 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.749123 +INFO: TimeDuration, Event = Add_end, Time = 0.000296 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.749135 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.749360 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.757929 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.758231 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.758243 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.758466 +INFO: TimeDuration, Event = Relu_end, Time = 0.000223 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.770370 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.770669 +INFO: TimeDuration, Event = Add_end, Time = 0.000300 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.770682 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.770907 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352790.770920 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352790.773663 +INFO: TimeDuration, Event = Pool_end, Time = 0.002743 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.780101 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.780432 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.780446 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.780566 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.786393 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.786726 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.786738 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.786877 +INFO: TimeDuration, Event = Relu_end, Time = 0.000139 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.794400 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.794725 +INFO: TimeDuration, Event = Add_end, Time = 0.000325 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.794738 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.794858 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352790.794876 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352790.797638 +INFO: TimeDuration, Event = Pool_end, Time = 0.002763 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.799795 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.800082 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.800095 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.800135 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.802384 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.802669 +INFO: TimeDuration, Event = Add_end, Time = 0.000286 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.802680 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.802720 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.804978 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.805258 +INFO: TimeDuration, Event = Add_end, Time = 0.000280 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.805269 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.805309 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352790.805322 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352790.805938 +INFO: TimeDuration, Event = Pool_end, Time = 0.000617 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352790.805955 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352790.806043 +INFO: TimeDuration, Event = Mul_end, Time = 0.000087 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.806055 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.806074 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.806086 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.806103 +INFO: TimeDuration, Event = Relu_end, Time = 0.000018 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352790.806116 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352790.806161 +INFO: TimeDuration, Event = Mul_end, Time = 0.000045 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.806173 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.806190 +INFO: TimeDuration, Event = Add_end, Time = 0.000017 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352790.806203 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352790.806281 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000077 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 137.410714, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.845694 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.846594 +INFO: TimeDuration, Event = Add_end, Time = 0.000900 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.846608 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.847429 +INFO: TimeDuration, Event = Relu_end, Time = 0.000821 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.861838 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.862740 +INFO: TimeDuration, Event = Add_end, Time = 0.000902 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.862755 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.863606 +INFO: TimeDuration, Event = Relu_end, Time = 0.000850 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352790.863619 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352790.866566 +INFO: TimeDuration, Event = Pool_end, Time = 0.002948 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.888407 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.888876 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.888892 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.889325 +INFO: TimeDuration, Event = Relu_end, Time = 0.000433 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.905431 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.905905 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.905918 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.906354 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352790.906366 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352790.913058 +INFO: TimeDuration, Event = Pool_end, Time = 0.006691 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.922959 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.923254 +INFO: TimeDuration, Event = Add_end, Time = 0.000295 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.923265 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.923491 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.932063 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.932365 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.932433 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.932656 +INFO: TimeDuration, Event = Relu_end, Time = 0.000223 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.944501 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.944799 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.944813 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.945037 +INFO: TimeDuration, Event = Relu_end, Time = 0.000224 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352790.945049 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352790.947793 +INFO: TimeDuration, Event = Pool_end, Time = 0.002744 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.954237 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.954569 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.954582 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.954703 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.960538 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.960871 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.960882 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.961002 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.968568 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.968897 +INFO: TimeDuration, Event = Add_end, Time = 0.000329 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.968908 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.969028 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352790.969047 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352790.971812 +INFO: TimeDuration, Event = Pool_end, Time = 0.002765 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.973950 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.974234 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.974246 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.974286 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.976518 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.976804 +INFO: TimeDuration, Event = Add_end, Time = 0.000286 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.976817 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.976856 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.979142 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.979420 +INFO: TimeDuration, Event = Add_end, Time = 0.000278 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.979432 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.979472 +INFO: TimeDuration, Event = Relu_end, Time = 0.000041 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352790.979485 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352790.980088 +INFO: TimeDuration, Event = Pool_end, Time = 0.000604 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352790.980105 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352790.980192 +INFO: TimeDuration, Event = Mul_end, Time = 0.000087 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.980206 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.980225 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352790.980262 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352790.980281 +INFO: TimeDuration, Event = Relu_end, Time = 0.000019 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352790.980295 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352790.980390 +INFO: TimeDuration, Event = Mul_end, Time = 0.000095 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352790.980400 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352790.980418 +INFO: TimeDuration, Event = Add_end, Time = 0.000018 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352790.980431 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352790.980509 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000077 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 137.559943, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.020286 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.021190 +INFO: TimeDuration, Event = Add_end, Time = 0.000904 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.021205 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.022029 +INFO: TimeDuration, Event = Relu_end, Time = 0.000824 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.036382 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.037283 +INFO: TimeDuration, Event = Add_end, Time = 0.000901 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.037297 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.038150 +INFO: TimeDuration, Event = Relu_end, Time = 0.000853 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352791.038163 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352791.041107 +INFO: TimeDuration, Event = Pool_end, Time = 0.002944 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.062956 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.063423 +INFO: TimeDuration, Event = Add_end, Time = 0.000468 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.063439 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.063871 +INFO: TimeDuration, Event = Relu_end, Time = 0.000432 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.079993 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.080468 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.080483 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.080915 +INFO: TimeDuration, Event = Relu_end, Time = 0.000432 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352791.080926 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352791.087601 +INFO: TimeDuration, Event = Pool_end, Time = 0.006675 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.097513 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.097808 +INFO: TimeDuration, Event = Add_end, Time = 0.000295 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.097820 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.098050 +INFO: TimeDuration, Event = Relu_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.106621 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.106924 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.106937 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.107162 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.119049 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.119349 +INFO: TimeDuration, Event = Add_end, Time = 0.000300 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.119360 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.119584 +INFO: TimeDuration, Event = Relu_end, Time = 0.000224 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352791.119595 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352791.122341 +INFO: TimeDuration, Event = Pool_end, Time = 0.002746 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.128779 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.129111 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.129124 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.129243 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.135386 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.135715 +INFO: TimeDuration, Event = Add_end, Time = 0.000329 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.135726 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.135847 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.143399 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.143725 +INFO: TimeDuration, Event = Add_end, Time = 0.000326 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.143738 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.143859 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352791.143875 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352791.146639 +INFO: TimeDuration, Event = Pool_end, Time = 0.002764 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.148761 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.149045 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.149056 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.149096 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.151360 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.151652 +INFO: TimeDuration, Event = Add_end, Time = 0.000291 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.151664 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.151704 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.153971 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.154251 +INFO: TimeDuration, Event = Add_end, Time = 0.000280 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.154264 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.154305 +INFO: TimeDuration, Event = Relu_end, Time = 0.000041 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352791.154318 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352791.154932 +INFO: TimeDuration, Event = Pool_end, Time = 0.000615 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352791.154949 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352791.155035 +INFO: TimeDuration, Event = Mul_end, Time = 0.000086 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.155048 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.155067 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.155078 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.155096 +INFO: TimeDuration, Event = Relu_end, Time = 0.000018 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352791.155107 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352791.155151 +INFO: TimeDuration, Event = Mul_end, Time = 0.000043 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.155164 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.155182 +INFO: TimeDuration, Event = Add_end, Time = 0.000017 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352791.155195 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352791.155273 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000078 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 137.751190, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.194855 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.195761 +INFO: TimeDuration, Event = Add_end, Time = 0.000905 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.195774 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.196598 +INFO: TimeDuration, Event = Relu_end, Time = 0.000824 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.210954 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.211851 +INFO: TimeDuration, Event = Add_end, Time = 0.000897 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.211864 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.212716 +INFO: TimeDuration, Event = Relu_end, Time = 0.000851 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352791.212729 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352791.215681 +INFO: TimeDuration, Event = Pool_end, Time = 0.002952 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.237519 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.237989 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.238003 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.238438 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.254556 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.255032 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.255045 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.255477 +INFO: TimeDuration, Event = Relu_end, Time = 0.000433 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352791.255488 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352791.262181 +INFO: TimeDuration, Event = Pool_end, Time = 0.006692 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.272079 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.272375 +INFO: TimeDuration, Event = Add_end, Time = 0.000295 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.272433 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.272657 +INFO: TimeDuration, Event = Relu_end, Time = 0.000224 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.281179 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.281482 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.281494 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.281719 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.293613 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.293914 +INFO: TimeDuration, Event = Add_end, Time = 0.000300 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.293925 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.294150 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352791.294161 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352791.296906 +INFO: TimeDuration, Event = Pool_end, Time = 0.002745 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.303352 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.303680 +INFO: TimeDuration, Event = Add_end, Time = 0.000329 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.303692 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.303812 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.309959 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.310290 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.310302 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.310422 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.321101 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.321437 +INFO: TimeDuration, Event = Add_end, Time = 0.000336 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.321451 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.321580 +INFO: TimeDuration, Event = Relu_end, Time = 0.000129 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352791.321599 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352791.324342 +INFO: TimeDuration, Event = Pool_end, Time = 0.002744 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.326522 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.326805 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.326820 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.326863 +INFO: TimeDuration, Event = Relu_end, Time = 0.000044 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.329126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.329412 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.329424 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.329464 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.331710 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.331994 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.332006 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.332046 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352791.332059 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352791.332677 +INFO: TimeDuration, Event = Pool_end, Time = 0.000618 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352791.332695 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352791.332791 +INFO: TimeDuration, Event = Mul_end, Time = 0.000096 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.332805 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.332824 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.332835 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.332852 +INFO: TimeDuration, Event = Relu_end, Time = 0.000018 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352791.332865 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352791.332917 +INFO: TimeDuration, Event = Mul_end, Time = 0.000051 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.332928 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.332945 +INFO: TimeDuration, Event = Add_end, Time = 0.000017 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352791.332958 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352791.333038 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000079 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 66.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 140.837414, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.372477 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.373380 +INFO: TimeDuration, Event = Add_end, Time = 0.000904 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.373396 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.374218 +INFO: TimeDuration, Event = Relu_end, Time = 0.000822 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.388585 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.389484 +INFO: TimeDuration, Event = Add_end, Time = 0.000899 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.389499 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.390348 +INFO: TimeDuration, Event = Relu_end, Time = 0.000849 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352791.390362 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352791.393310 +INFO: TimeDuration, Event = Pool_end, Time = 0.002948 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.415149 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.415617 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.415633 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.416064 +INFO: TimeDuration, Event = Relu_end, Time = 0.000431 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.432209 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.432688 +INFO: TimeDuration, Event = Add_end, Time = 0.000479 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.432701 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.433136 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352791.433148 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352791.439835 +INFO: TimeDuration, Event = Pool_end, Time = 0.006687 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.449734 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.450030 +INFO: TimeDuration, Event = Add_end, Time = 0.000296 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.450041 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.450265 +INFO: TimeDuration, Event = Relu_end, Time = 0.000224 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.458835 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.459141 +INFO: TimeDuration, Event = Add_end, Time = 0.000306 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.459152 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.459376 +INFO: TimeDuration, Event = Relu_end, Time = 0.000224 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.471266 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.471566 +INFO: TimeDuration, Event = Add_end, Time = 0.000300 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.471578 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.471802 +INFO: TimeDuration, Event = Relu_end, Time = 0.000224 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352791.471814 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352791.474560 +INFO: TimeDuration, Event = Pool_end, Time = 0.002745 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.481004 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.481337 +INFO: TimeDuration, Event = Add_end, Time = 0.000334 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.481349 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.481469 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.487626 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.487951 +INFO: TimeDuration, Event = Add_end, Time = 0.000325 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.487964 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.488085 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.495939 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.496269 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.496282 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.496401 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352791.496416 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352791.499180 +INFO: TimeDuration, Event = Pool_end, Time = 0.002764 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.501312 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.501592 +INFO: TimeDuration, Event = Add_end, Time = 0.000280 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.501606 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.501646 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.503896 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.504177 +INFO: TimeDuration, Event = Add_end, Time = 0.000280 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.504190 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.504230 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.506493 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.506776 +INFO: TimeDuration, Event = Add_end, Time = 0.000282 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.506788 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.506829 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352791.506841 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352791.507456 +INFO: TimeDuration, Event = Pool_end, Time = 0.000614 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352791.507472 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352791.507560 +INFO: TimeDuration, Event = Mul_end, Time = 0.000088 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.507573 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.507592 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.507604 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.507622 +INFO: TimeDuration, Event = Relu_end, Time = 0.000018 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352791.507636 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352791.507681 +INFO: TimeDuration, Event = Mul_end, Time = 0.000045 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.507694 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.507711 +INFO: TimeDuration, Event = Add_end, Time = 0.000017 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352791.507724 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352791.507803 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000079 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 138.031558, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.547520 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.548426 +INFO: TimeDuration, Event = Add_end, Time = 0.000906 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.548475 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.549295 +INFO: TimeDuration, Event = Relu_end, Time = 0.000820 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.563653 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.564550 +INFO: TimeDuration, Event = Add_end, Time = 0.000897 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.564565 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.565423 +INFO: TimeDuration, Event = Relu_end, Time = 0.000859 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352791.565436 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352791.569910 +INFO: TimeDuration, Event = Pool_end, Time = 0.004474 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.592318 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.592796 +INFO: TimeDuration, Event = Add_end, Time = 0.000478 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.592810 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.593244 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.607269 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.607738 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.607752 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.608186 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352791.608198 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352791.614896 +INFO: TimeDuration, Event = Pool_end, Time = 0.006698 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.624797 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.625096 +INFO: TimeDuration, Event = Add_end, Time = 0.000299 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.625109 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.625335 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.633891 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.634194 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.634207 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.634432 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.646315 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.646614 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.646626 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.646850 +INFO: TimeDuration, Event = Relu_end, Time = 0.000224 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352791.646862 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352791.649610 +INFO: TimeDuration, Event = Pool_end, Time = 0.002748 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.656210 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.656535 +INFO: TimeDuration, Event = Add_end, Time = 0.000326 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.656548 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.656668 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.662516 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.662844 +INFO: TimeDuration, Event = Add_end, Time = 0.000328 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.662856 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.662976 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.670836 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.671157 +INFO: TimeDuration, Event = Add_end, Time = 0.000321 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.671170 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.671290 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352791.671307 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352791.674079 +INFO: TimeDuration, Event = Pool_end, Time = 0.002772 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.676204 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.676486 +INFO: TimeDuration, Event = Add_end, Time = 0.000282 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.676498 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.676539 +INFO: TimeDuration, Event = Relu_end, Time = 0.000041 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.678807 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.679091 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.679102 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.679142 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.681379 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.681664 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.681675 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.681716 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352791.681728 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352791.682339 +INFO: TimeDuration, Event = Pool_end, Time = 0.000611 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352791.682356 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352791.682470 +INFO: TimeDuration, Event = Mul_end, Time = 0.000114 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.682482 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.682508 +INFO: TimeDuration, Event = Add_end, Time = 0.000026 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.682519 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.682537 +INFO: TimeDuration, Event = Relu_end, Time = 0.000018 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352791.682550 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352791.682598 +INFO: TimeDuration, Event = Mul_end, Time = 0.000047 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.682611 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.682629 +INFO: TimeDuration, Event = Add_end, Time = 0.000018 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352791.682641 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352791.682761 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000121 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 137.960427, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.723787 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.724696 +INFO: TimeDuration, Event = Add_end, Time = 0.000909 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.724711 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.725537 +INFO: TimeDuration, Event = Relu_end, Time = 0.000825 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.739895 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.740797 +INFO: TimeDuration, Event = Add_end, Time = 0.000902 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.740811 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.741663 +INFO: TimeDuration, Event = Relu_end, Time = 0.000851 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352791.741675 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352791.744620 +INFO: TimeDuration, Event = Pool_end, Time = 0.002945 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.766467 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.766937 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.766953 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.767387 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.783487 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.783962 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.783973 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.784409 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352791.784419 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352791.791113 +INFO: TimeDuration, Event = Pool_end, Time = 0.006695 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.801007 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.801302 +INFO: TimeDuration, Event = Add_end, Time = 0.000295 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.801315 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.801540 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.810119 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.810421 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.810435 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.810658 +INFO: TimeDuration, Event = Relu_end, Time = 0.000223 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.822559 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.822858 +INFO: TimeDuration, Event = Add_end, Time = 0.000300 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.822870 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.823093 +INFO: TimeDuration, Event = Relu_end, Time = 0.000223 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352791.823106 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352791.825852 +INFO: TimeDuration, Event = Pool_end, Time = 0.002746 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.832292 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.832631 +INFO: TimeDuration, Event = Add_end, Time = 0.000339 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.832643 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.832763 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.838582 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.838910 +INFO: TimeDuration, Event = Add_end, Time = 0.000328 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.838922 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.839043 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.846906 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.847231 +INFO: TimeDuration, Event = Add_end, Time = 0.000325 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.847256 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.847377 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352791.847393 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352791.850148 +INFO: TimeDuration, Event = Pool_end, Time = 0.002756 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.852266 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.852550 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.852563 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.852604 +INFO: TimeDuration, Event = Relu_end, Time = 0.000041 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.854844 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.855125 +INFO: TimeDuration, Event = Add_end, Time = 0.000282 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.855137 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.855178 +INFO: TimeDuration, Event = Relu_end, Time = 0.000041 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.857435 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.857716 +INFO: TimeDuration, Event = Add_end, Time = 0.000281 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.857738 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.857779 +INFO: TimeDuration, Event = Relu_end, Time = 0.000041 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352791.857791 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352791.858397 +INFO: TimeDuration, Event = Pool_end, Time = 0.000606 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352791.858413 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352791.858501 +INFO: TimeDuration, Event = Mul_end, Time = 0.000088 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.858514 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.858533 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.858546 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.858564 +INFO: TimeDuration, Event = Relu_end, Time = 0.000018 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352791.858576 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352791.858621 +INFO: TimeDuration, Event = Mul_end, Time = 0.000044 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.858633 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.858650 +INFO: TimeDuration, Event = Add_end, Time = 0.000018 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352791.858662 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352791.858738 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000076 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 137.694785, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.898326 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.899232 +INFO: TimeDuration, Event = Add_end, Time = 0.000906 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.899246 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.900071 +INFO: TimeDuration, Event = Relu_end, Time = 0.000825 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.914430 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.915328 +INFO: TimeDuration, Event = Add_end, Time = 0.000897 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.915341 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.916197 +INFO: TimeDuration, Event = Relu_end, Time = 0.000856 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352791.916210 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352791.919161 +INFO: TimeDuration, Event = Pool_end, Time = 0.002951 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.940994 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.941464 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.941479 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.941910 +INFO: TimeDuration, Event = Relu_end, Time = 0.000431 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.958053 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.958527 +INFO: TimeDuration, Event = Add_end, Time = 0.000474 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.958554 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.958991 +INFO: TimeDuration, Event = Relu_end, Time = 0.000437 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352791.959002 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352791.965685 +INFO: TimeDuration, Event = Pool_end, Time = 0.006683 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.975578 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.975873 +INFO: TimeDuration, Event = Add_end, Time = 0.000296 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.975886 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.976113 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.984674 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.984977 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.984990 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.985215 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352791.997108 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352791.997407 +INFO: TimeDuration, Event = Add_end, Time = 0.000299 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352791.997430 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352791.997655 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352791.997666 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352792.000403 +INFO: TimeDuration, Event = Pool_end, Time = 0.002737 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.007051 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.007384 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.007397 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.007516 +INFO: TimeDuration, Event = Relu_end, Time = 0.000119 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.013347 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.013676 +INFO: TimeDuration, Event = Add_end, Time = 0.000329 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.013689 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.013809 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.023033 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.023361 +INFO: TimeDuration, Event = Add_end, Time = 0.000328 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.023373 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.023496 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352792.023518 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352792.026278 +INFO: TimeDuration, Event = Pool_end, Time = 0.002760 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.028398 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.028679 +INFO: TimeDuration, Event = Add_end, Time = 0.000281 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.028692 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.028732 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.030989 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.031275 +INFO: TimeDuration, Event = Add_end, Time = 0.000286 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.031287 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.031327 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.033567 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.033850 +INFO: TimeDuration, Event = Add_end, Time = 0.000283 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.033863 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.033903 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352792.033915 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352792.034526 +INFO: TimeDuration, Event = Pool_end, Time = 0.000612 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352792.034543 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352792.034635 +INFO: TimeDuration, Event = Mul_end, Time = 0.000092 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.034648 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.034670 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.034680 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.034698 +INFO: TimeDuration, Event = Relu_end, Time = 0.000018 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352792.034712 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352792.034760 +INFO: TimeDuration, Event = Mul_end, Time = 0.000049 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.034772 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.034790 +INFO: TimeDuration, Event = Add_end, Time = 0.000018 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352792.034802 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352792.034928 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000126 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 66.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.304640, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.076930 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.077833 +INFO: TimeDuration, Event = Add_end, Time = 0.000902 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.077848 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.078676 +INFO: TimeDuration, Event = Relu_end, Time = 0.000828 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.093035 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.093941 +INFO: TimeDuration, Event = Add_end, Time = 0.000906 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.093955 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.094808 +INFO: TimeDuration, Event = Relu_end, Time = 0.000854 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352792.094821 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352792.097760 +INFO: TimeDuration, Event = Pool_end, Time = 0.002939 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.119596 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.120065 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.120080 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.120515 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.136642 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.137119 +INFO: TimeDuration, Event = Add_end, Time = 0.000477 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.137130 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.137564 +INFO: TimeDuration, Event = Relu_end, Time = 0.000433 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352792.137576 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352792.144269 +INFO: TimeDuration, Event = Pool_end, Time = 0.006692 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.154175 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.154470 +INFO: TimeDuration, Event = Add_end, Time = 0.000296 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.154482 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.154707 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.163273 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.163574 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.163608 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.163832 +INFO: TimeDuration, Event = Relu_end, Time = 0.000224 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.175694 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.175993 +INFO: TimeDuration, Event = Add_end, Time = 0.000300 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.176005 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.176230 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352792.176242 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352792.178987 +INFO: TimeDuration, Event = Pool_end, Time = 0.002745 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.185433 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.185766 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.185778 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.185898 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.191729 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.192055 +INFO: TimeDuration, Event = Add_end, Time = 0.000327 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.192066 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.192187 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.199735 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.200061 +INFO: TimeDuration, Event = Add_end, Time = 0.000326 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.200086 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.200207 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352792.200225 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352792.202976 +INFO: TimeDuration, Event = Pool_end, Time = 0.002751 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.205119 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.205400 +INFO: TimeDuration, Event = Add_end, Time = 0.000281 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.205412 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.205453 +INFO: TimeDuration, Event = Relu_end, Time = 0.000041 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.207702 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.207987 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.208000 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.208040 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.210289 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.210567 +INFO: TimeDuration, Event = Add_end, Time = 0.000278 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.210578 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.210618 +INFO: TimeDuration, Event = Relu_end, Time = 0.000040 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352792.210631 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352792.211251 +INFO: TimeDuration, Event = Pool_end, Time = 0.000620 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352792.211267 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352792.211355 +INFO: TimeDuration, Event = Mul_end, Time = 0.000088 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.211367 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.211387 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.211397 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.211414 +INFO: TimeDuration, Event = Relu_end, Time = 0.000017 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352792.211431 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352792.211475 +INFO: TimeDuration, Event = Mul_end, Time = 0.000044 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.211488 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.211506 +INFO: TimeDuration, Event = Add_end, Time = 0.000017 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352792.211518 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352792.211596 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000079 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 137.379113, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.251242 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.252143 +INFO: TimeDuration, Event = Add_end, Time = 0.000901 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.252158 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.252986 +INFO: TimeDuration, Event = Relu_end, Time = 0.000828 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.267341 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.268244 +INFO: TimeDuration, Event = Add_end, Time = 0.000902 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.268260 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.269112 +INFO: TimeDuration, Event = Relu_end, Time = 0.000852 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352792.269125 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352792.272068 +INFO: TimeDuration, Event = Pool_end, Time = 0.002943 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.293903 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.294372 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.294387 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.294819 +INFO: TimeDuration, Event = Relu_end, Time = 0.000432 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.310961 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.311435 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.311447 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.311883 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352792.311894 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352792.318600 +INFO: TimeDuration, Event = Pool_end, Time = 0.006706 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.328494 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.328789 +INFO: TimeDuration, Event = Add_end, Time = 0.000296 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.328802 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.329027 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.337600 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.337903 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.337915 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.338140 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.350035 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.350334 +INFO: TimeDuration, Event = Add_end, Time = 0.000299 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.350346 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.350572 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352792.350583 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352792.353327 +INFO: TimeDuration, Event = Pool_end, Time = 0.002744 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.360904 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.361280 +INFO: TimeDuration, Event = Add_end, Time = 0.000377 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.361314 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.361457 +INFO: TimeDuration, Event = Relu_end, Time = 0.000143 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.368088 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.368442 +INFO: TimeDuration, Event = Add_end, Time = 0.000354 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.368474 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.368613 +INFO: TimeDuration, Event = Relu_end, Time = 0.000139 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.376566 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.376899 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.376915 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.377041 +INFO: TimeDuration, Event = Relu_end, Time = 0.000125 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352792.377067 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352792.379814 +INFO: TimeDuration, Event = Pool_end, Time = 0.002747 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.382106 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.382394 +INFO: TimeDuration, Event = Add_end, Time = 0.000288 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.382410 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.382456 +INFO: TimeDuration, Event = Relu_end, Time = 0.000047 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.384829 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.385121 +INFO: TimeDuration, Event = Add_end, Time = 0.000292 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.385136 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.385181 +INFO: TimeDuration, Event = Relu_end, Time = 0.000045 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.387515 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.387802 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.387818 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.387863 +INFO: TimeDuration, Event = Relu_end, Time = 0.000045 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352792.387877 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352792.388519 +INFO: TimeDuration, Event = Pool_end, Time = 0.000642 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352792.388542 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352792.388645 +INFO: TimeDuration, Event = Mul_end, Time = 0.000102 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.388662 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.388688 +INFO: TimeDuration, Event = Add_end, Time = 0.000026 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.388702 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.388725 +INFO: TimeDuration, Event = Relu_end, Time = 0.000022 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352792.388742 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352792.388800 +INFO: TimeDuration, Event = Mul_end, Time = 0.000058 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.388816 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.388838 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352792.388855 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352792.388939 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000084 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 68.199997 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 140.416118, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.434453 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.435394 +INFO: TimeDuration, Event = Add_end, Time = 0.000941 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.435431 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.436265 +INFO: TimeDuration, Event = Relu_end, Time = 0.000834 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.453673 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.454584 +INFO: TimeDuration, Event = Add_end, Time = 0.000910 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.454606 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.455463 +INFO: TimeDuration, Event = Relu_end, Time = 0.000857 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352792.455482 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352792.458405 +INFO: TimeDuration, Event = Pool_end, Time = 0.002924 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.480345 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.480821 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.480844 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.481282 +INFO: TimeDuration, Event = Relu_end, Time = 0.000437 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.497559 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.498039 +INFO: TimeDuration, Event = Add_end, Time = 0.000481 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.498056 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.498492 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352792.498510 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352792.505186 +INFO: TimeDuration, Event = Pool_end, Time = 0.006675 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.515285 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.515588 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.515606 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.515837 +INFO: TimeDuration, Event = Relu_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.524677 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.524988 +INFO: TimeDuration, Event = Add_end, Time = 0.000310 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.525005 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.525234 +INFO: TimeDuration, Event = Relu_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.537297 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.537604 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.537625 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.537855 +INFO: TimeDuration, Event = Relu_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352792.537873 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352792.540592 +INFO: TimeDuration, Event = Pool_end, Time = 0.002719 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.547208 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.547550 +INFO: TimeDuration, Event = Add_end, Time = 0.000343 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.547569 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.547693 +INFO: TimeDuration, Event = Relu_end, Time = 0.000125 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.553726 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.554063 +INFO: TimeDuration, Event = Add_end, Time = 0.000338 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.554082 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.554208 +INFO: TimeDuration, Event = Relu_end, Time = 0.000125 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.562155 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.562485 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.562497 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.562618 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352792.562639 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352792.565395 +INFO: TimeDuration, Event = Pool_end, Time = 0.002756 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.567551 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.567836 +INFO: TimeDuration, Event = Add_end, Time = 0.000286 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.567870 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.567912 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.570161 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.570453 +INFO: TimeDuration, Event = Add_end, Time = 0.000292 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.570467 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.570512 +INFO: TimeDuration, Event = Relu_end, Time = 0.000046 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.572769 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.573050 +INFO: TimeDuration, Event = Add_end, Time = 0.000281 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.573062 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.573103 +INFO: TimeDuration, Event = Relu_end, Time = 0.000041 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352792.573115 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352792.573730 +INFO: TimeDuration, Event = Pool_end, Time = 0.000615 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352792.573749 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352792.573840 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.573855 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.573877 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.573890 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.573910 +INFO: TimeDuration, Event = Relu_end, Time = 0.000019 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352792.573925 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352792.573974 +INFO: TimeDuration, Event = Mul_end, Time = 0.000049 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.573988 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.574007 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352792.574021 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352792.574104 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000083 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 143.469573, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.614050 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.614954 +INFO: TimeDuration, Event = Add_end, Time = 0.000903 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.614969 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.615794 +INFO: TimeDuration, Event = Relu_end, Time = 0.000825 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.630177 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.631078 +INFO: TimeDuration, Event = Add_end, Time = 0.000900 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.631094 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.631943 +INFO: TimeDuration, Event = Relu_end, Time = 0.000849 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352792.631956 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352792.634904 +INFO: TimeDuration, Event = Pool_end, Time = 0.002948 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.656744 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.657216 +INFO: TimeDuration, Event = Add_end, Time = 0.000472 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.657232 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.657666 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.673800 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.674276 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.674290 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.674726 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352792.674738 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352792.681426 +INFO: TimeDuration, Event = Pool_end, Time = 0.006687 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.691594 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.691894 +INFO: TimeDuration, Event = Add_end, Time = 0.000300 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.691908 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.692135 +INFO: TimeDuration, Event = Relu_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.700710 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.701016 +INFO: TimeDuration, Event = Add_end, Time = 0.000306 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.701029 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.701258 +INFO: TimeDuration, Event = Relu_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.713136 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.713440 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.713455 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.713681 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352792.713693 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352792.716432 +INFO: TimeDuration, Event = Pool_end, Time = 0.002738 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.722885 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.723218 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.723232 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.723352 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.729192 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.729516 +INFO: TimeDuration, Event = Add_end, Time = 0.000325 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.729530 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.729651 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.737209 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.737538 +INFO: TimeDuration, Event = Add_end, Time = 0.000329 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.737552 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.737674 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352792.737692 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352792.740448 +INFO: TimeDuration, Event = Pool_end, Time = 0.002756 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.742603 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.742886 +INFO: TimeDuration, Event = Add_end, Time = 0.000283 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.742898 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.742940 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.745180 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.745464 +INFO: TimeDuration, Event = Add_end, Time = 0.000283 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.745477 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.745519 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.747766 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.748048 +INFO: TimeDuration, Event = Add_end, Time = 0.000282 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.748062 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.748103 +INFO: TimeDuration, Event = Relu_end, Time = 0.000041 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352792.748127 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352792.748728 +INFO: TimeDuration, Event = Pool_end, Time = 0.000601 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352792.748747 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352792.748838 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.748852 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.748873 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.748884 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.748904 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352792.748916 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352792.748964 +INFO: TimeDuration, Event = Mul_end, Time = 0.000047 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.748977 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.748996 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352792.749010 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352792.749094 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000084 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 137.776289, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.788776 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.789673 +INFO: TimeDuration, Event = Add_end, Time = 0.000897 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.789688 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.790514 +INFO: TimeDuration, Event = Relu_end, Time = 0.000826 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.804984 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.805876 +INFO: TimeDuration, Event = Add_end, Time = 0.000892 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.805893 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.806747 +INFO: TimeDuration, Event = Relu_end, Time = 0.000854 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352792.806762 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352792.809707 +INFO: TimeDuration, Event = Pool_end, Time = 0.002946 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.831545 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.832015 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.832031 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.832464 +INFO: TimeDuration, Event = Relu_end, Time = 0.000433 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.854861 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.855343 +INFO: TimeDuration, Event = Add_end, Time = 0.000481 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.855358 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.855796 +INFO: TimeDuration, Event = Relu_end, Time = 0.000437 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352792.855812 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352792.862480 +INFO: TimeDuration, Event = Pool_end, Time = 0.006667 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.872646 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.872944 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.872959 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.873186 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.881878 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.882182 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.882195 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.882421 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.894491 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.894794 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.894806 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.895034 +INFO: TimeDuration, Event = Relu_end, Time = 0.000228 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352792.895047 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352792.897784 +INFO: TimeDuration, Event = Pool_end, Time = 0.002736 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.904379 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.904716 +INFO: TimeDuration, Event = Add_end, Time = 0.000337 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.904729 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.904850 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.911075 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.911412 +INFO: TimeDuration, Event = Add_end, Time = 0.000336 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.911424 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.911547 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.919512 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.919844 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.919857 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.919979 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352792.919998 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352792.922753 +INFO: TimeDuration, Event = Pool_end, Time = 0.002754 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.924976 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.925265 +INFO: TimeDuration, Event = Add_end, Time = 0.000289 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.925279 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.925321 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.927609 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.927898 +INFO: TimeDuration, Event = Add_end, Time = 0.000290 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.927914 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.927956 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.930235 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.930520 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.930534 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.930576 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352792.930589 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352792.931211 +INFO: TimeDuration, Event = Pool_end, Time = 0.000622 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352792.931229 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352792.931321 +INFO: TimeDuration, Event = Mul_end, Time = 0.000092 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.931335 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.931357 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.931370 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.931390 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352792.931405 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352792.931453 +INFO: TimeDuration, Event = Mul_end, Time = 0.000048 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.931466 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.931485 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352792.931499 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352792.931581 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000082 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 145.549538, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.971995 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.972901 +INFO: TimeDuration, Event = Add_end, Time = 0.000905 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.972918 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.973742 +INFO: TimeDuration, Event = Relu_end, Time = 0.000825 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352792.988225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352792.989129 +INFO: TimeDuration, Event = Add_end, Time = 0.000903 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352792.989145 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352792.990002 +INFO: TimeDuration, Event = Relu_end, Time = 0.000857 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352792.990016 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352792.992955 +INFO: TimeDuration, Event = Pool_end, Time = 0.002939 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.014800 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.015270 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.015285 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.015718 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.032429 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.032903 +INFO: TimeDuration, Event = Add_end, Time = 0.000474 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.032916 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.033356 +INFO: TimeDuration, Event = Relu_end, Time = 0.000440 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352793.033369 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352793.040054 +INFO: TimeDuration, Event = Pool_end, Time = 0.006685 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.050224 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.050522 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.050535 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.050785 +INFO: TimeDuration, Event = Relu_end, Time = 0.000249 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.059455 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.059760 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.059773 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.060002 +INFO: TimeDuration, Event = Relu_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.072081 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.072389 +INFO: TimeDuration, Event = Add_end, Time = 0.000309 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.072434 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.072667 +INFO: TimeDuration, Event = Relu_end, Time = 0.000232 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352793.072680 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352793.075376 +INFO: TimeDuration, Event = Pool_end, Time = 0.002696 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.081959 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.082294 +INFO: TimeDuration, Event = Add_end, Time = 0.000336 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.082307 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.082429 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.088441 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.088771 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.088786 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.088909 +INFO: TimeDuration, Event = Relu_end, Time = 0.000124 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.096814 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.097144 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.097156 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.097279 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352793.097299 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352793.100055 +INFO: TimeDuration, Event = Pool_end, Time = 0.002756 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.102270 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.102555 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.102567 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.102609 +INFO: TimeDuration, Event = Relu_end, Time = 0.000041 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.104892 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.105180 +INFO: TimeDuration, Event = Add_end, Time = 0.000288 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.105195 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.105236 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.107502 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.107788 +INFO: TimeDuration, Event = Add_end, Time = 0.000286 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.107825 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.107869 +INFO: TimeDuration, Event = Relu_end, Time = 0.000044 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352793.107882 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352793.108476 +INFO: TimeDuration, Event = Pool_end, Time = 0.000593 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352793.108495 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352793.108591 +INFO: TimeDuration, Event = Mul_end, Time = 0.000096 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.108605 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.108629 +INFO: TimeDuration, Event = Add_end, Time = 0.000024 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.108640 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.108660 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352793.108675 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352793.108726 +INFO: TimeDuration, Event = Mul_end, Time = 0.000052 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.108739 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.108758 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352793.108773 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352793.108887 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000114 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 66.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.576377, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.149169 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.150082 +INFO: TimeDuration, Event = Add_end, Time = 0.000913 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.150097 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.150924 +INFO: TimeDuration, Event = Relu_end, Time = 0.000828 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.165468 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.166369 +INFO: TimeDuration, Event = Add_end, Time = 0.000901 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.166386 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.167245 +INFO: TimeDuration, Event = Relu_end, Time = 0.000859 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352793.167259 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352793.170194 +INFO: TimeDuration, Event = Pool_end, Time = 0.002935 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.192035 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.192506 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.192523 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.192958 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.209673 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.210150 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.210162 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.210599 +INFO: TimeDuration, Event = Relu_end, Time = 0.000437 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352793.210612 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352793.217300 +INFO: TimeDuration, Event = Pool_end, Time = 0.006688 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.227463 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.227761 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.227775 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.228002 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.236698 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.237002 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.237014 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.237240 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.249304 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.249604 +INFO: TimeDuration, Event = Add_end, Time = 0.000300 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.249617 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.249843 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352793.249856 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352793.252600 +INFO: TimeDuration, Event = Pool_end, Time = 0.002744 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.259183 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.259518 +INFO: TimeDuration, Event = Add_end, Time = 0.000335 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.259531 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.259651 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.265873 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.266203 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.266216 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.266338 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.274302 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.274633 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.274646 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.274768 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352793.274791 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352793.277543 +INFO: TimeDuration, Event = Pool_end, Time = 0.002752 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.279768 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.280055 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.280067 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.280109 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.282389 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.282679 +INFO: TimeDuration, Event = Add_end, Time = 0.000289 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.282691 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.282734 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.284997 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.285284 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.285298 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.285341 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352793.285355 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352793.285972 +INFO: TimeDuration, Event = Pool_end, Time = 0.000617 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352793.285992 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352793.286084 +INFO: TimeDuration, Event = Mul_end, Time = 0.000092 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.286098 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.286119 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.286133 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.286152 +INFO: TimeDuration, Event = Relu_end, Time = 0.000019 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352793.286166 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352793.286214 +INFO: TimeDuration, Event = Mul_end, Time = 0.000047 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.286228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.286247 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352793.286262 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352793.286346 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000084 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.905616, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.326903 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.327816 +INFO: TimeDuration, Event = Add_end, Time = 0.000914 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.327842 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.328686 +INFO: TimeDuration, Event = Relu_end, Time = 0.000843 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.343253 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.344159 +INFO: TimeDuration, Event = Add_end, Time = 0.000906 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.344175 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.345029 +INFO: TimeDuration, Event = Relu_end, Time = 0.000853 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352793.345044 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352793.347978 +INFO: TimeDuration, Event = Pool_end, Time = 0.002934 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.369820 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.370290 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.370306 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.370739 +INFO: TimeDuration, Event = Relu_end, Time = 0.000433 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.387485 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.387961 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.387975 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.388411 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352793.388425 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352793.395111 +INFO: TimeDuration, Event = Pool_end, Time = 0.006686 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.405277 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.405575 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.405588 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.405815 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.414510 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.414814 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.414827 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.415054 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.427120 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.427422 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.427437 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.427663 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352793.427676 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352793.430414 +INFO: TimeDuration, Event = Pool_end, Time = 0.002738 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.437000 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.437337 +INFO: TimeDuration, Event = Add_end, Time = 0.000337 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.437349 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.437472 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.443692 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.444022 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.444035 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.444156 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.452126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.452456 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.452470 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.452592 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352793.452610 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352793.455368 +INFO: TimeDuration, Event = Pool_end, Time = 0.002757 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.457603 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.457891 +INFO: TimeDuration, Event = Add_end, Time = 0.000288 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.457905 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.457947 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.460217 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.460505 +INFO: TimeDuration, Event = Add_end, Time = 0.000289 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.460519 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.460562 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.462832 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.463114 +INFO: TimeDuration, Event = Add_end, Time = 0.000282 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.463126 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.463169 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352793.463181 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352793.463803 +INFO: TimeDuration, Event = Pool_end, Time = 0.000622 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352793.463821 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352793.463921 +INFO: TimeDuration, Event = Mul_end, Time = 0.000100 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.463935 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.463956 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.463969 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.463989 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352793.464004 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352793.464055 +INFO: TimeDuration, Event = Mul_end, Time = 0.000050 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.464067 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.464087 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352793.464100 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352793.464183 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000083 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 140.113256, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.504207 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.505115 +INFO: TimeDuration, Event = Add_end, Time = 0.000907 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.505130 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.505955 +INFO: TimeDuration, Event = Relu_end, Time = 0.000826 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.520438 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.521341 +INFO: TimeDuration, Event = Add_end, Time = 0.000902 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.521356 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.522210 +INFO: TimeDuration, Event = Relu_end, Time = 0.000854 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352793.522223 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352793.525164 +INFO: TimeDuration, Event = Pool_end, Time = 0.002941 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.547004 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.547473 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.547490 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.547923 +INFO: TimeDuration, Event = Relu_end, Time = 0.000433 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.564655 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.565129 +INFO: TimeDuration, Event = Add_end, Time = 0.000474 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.565142 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.565575 +INFO: TimeDuration, Event = Relu_end, Time = 0.000433 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352793.565588 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352793.572281 +INFO: TimeDuration, Event = Pool_end, Time = 0.006692 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.582448 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.582744 +INFO: TimeDuration, Event = Add_end, Time = 0.000296 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.582757 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.582983 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.591674 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.591978 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.591992 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.592218 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.604287 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.604594 +INFO: TimeDuration, Event = Add_end, Time = 0.000307 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.604609 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.604835 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352793.604848 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352793.607580 +INFO: TimeDuration, Event = Pool_end, Time = 0.002732 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.614172 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.614504 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.614517 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.614639 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.620565 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.620896 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.620909 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.621032 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.628698 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.629028 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.629056 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.629180 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352793.629197 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352793.631940 +INFO: TimeDuration, Event = Pool_end, Time = 0.002743 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.634161 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.634451 +INFO: TimeDuration, Event = Add_end, Time = 0.000290 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.634465 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.634510 +INFO: TimeDuration, Event = Relu_end, Time = 0.000045 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.636784 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.637069 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.637084 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.637126 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.639417 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.639702 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.639715 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.639757 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352793.639769 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352793.640393 +INFO: TimeDuration, Event = Pool_end, Time = 0.000623 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352793.640408 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352793.640499 +INFO: TimeDuration, Event = Mul_end, Time = 0.000092 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.640515 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.640536 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.640548 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.640567 +INFO: TimeDuration, Event = Relu_end, Time = 0.000019 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352793.640580 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352793.640628 +INFO: TimeDuration, Event = Mul_end, Time = 0.000048 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.640641 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.640661 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352793.640676 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352793.640759 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000083 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.308825, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.682406 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.683314 +INFO: TimeDuration, Event = Add_end, Time = 0.000908 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.683330 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.684160 +INFO: TimeDuration, Event = Relu_end, Time = 0.000830 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.698653 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.699553 +INFO: TimeDuration, Event = Add_end, Time = 0.000900 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.699570 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.700427 +INFO: TimeDuration, Event = Relu_end, Time = 0.000857 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352793.700438 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352793.703378 +INFO: TimeDuration, Event = Pool_end, Time = 0.002940 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.725232 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.725708 +INFO: TimeDuration, Event = Add_end, Time = 0.000477 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.725726 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.726162 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.742865 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.743340 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.743353 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.743789 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352793.743802 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352793.750498 +INFO: TimeDuration, Event = Pool_end, Time = 0.006696 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.760661 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.760959 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.760998 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.761224 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.769900 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.770204 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.770217 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.770441 +INFO: TimeDuration, Event = Relu_end, Time = 0.000224 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.782512 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.782812 +INFO: TimeDuration, Event = Add_end, Time = 0.000300 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.782826 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.783052 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352793.783065 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352793.785805 +INFO: TimeDuration, Event = Pool_end, Time = 0.002740 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.792398 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.792737 +INFO: TimeDuration, Event = Add_end, Time = 0.000339 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.792749 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.792870 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.798780 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.799113 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.799127 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.799249 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.806902 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.807230 +INFO: TimeDuration, Event = Add_end, Time = 0.000328 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.807245 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.807369 +INFO: TimeDuration, Event = Relu_end, Time = 0.000125 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352793.807388 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352793.810144 +INFO: TimeDuration, Event = Pool_end, Time = 0.002756 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.812439 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.812726 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.812739 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.812782 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.815025 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.815315 +INFO: TimeDuration, Event = Add_end, Time = 0.000290 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.815329 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.815371 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.817688 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.817975 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.817989 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.818031 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352793.818044 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352793.818664 +INFO: TimeDuration, Event = Pool_end, Time = 0.000619 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352793.818682 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352793.818773 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.818788 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.818816 +INFO: TimeDuration, Event = Add_end, Time = 0.000028 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.818828 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.818848 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352793.818863 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352793.818914 +INFO: TimeDuration, Event = Mul_end, Time = 0.000051 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.818927 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.818946 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352793.818961 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352793.819047 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000086 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 66.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.335695, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.858990 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.859884 +INFO: TimeDuration, Event = Add_end, Time = 0.000893 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.859899 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.860730 +INFO: TimeDuration, Event = Relu_end, Time = 0.000832 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.875215 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.876118 +INFO: TimeDuration, Event = Add_end, Time = 0.000903 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.876133 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.876988 +INFO: TimeDuration, Event = Relu_end, Time = 0.000854 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352793.877002 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352793.879943 +INFO: TimeDuration, Event = Pool_end, Time = 0.002940 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.901785 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.902255 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.902272 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.902707 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.919435 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.919910 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.919925 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.920361 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352793.920436 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352793.927063 +INFO: TimeDuration, Event = Pool_end, Time = 0.006627 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.937229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.937527 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.937542 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.937768 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.946459 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.946763 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.946775 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.947001 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.959113 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.959415 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.959429 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.959656 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352793.959670 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352793.962409 +INFO: TimeDuration, Event = Pool_end, Time = 0.002740 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.968991 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.969326 +INFO: TimeDuration, Event = Add_end, Time = 0.000335 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.969340 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.969462 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.975692 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.976024 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.976038 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.976161 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.983810 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.984138 +INFO: TimeDuration, Event = Add_end, Time = 0.000328 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.984152 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.984276 +INFO: TimeDuration, Event = Relu_end, Time = 0.000125 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352793.984295 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352793.987673 +INFO: TimeDuration, Event = Pool_end, Time = 0.003378 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.989599 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.989890 +INFO: TimeDuration, Event = Add_end, Time = 0.000291 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.989916 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.989961 +INFO: TimeDuration, Event = Relu_end, Time = 0.000045 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.992229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.992520 +INFO: TimeDuration, Event = Add_end, Time = 0.000291 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.992535 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.992579 +INFO: TimeDuration, Event = Relu_end, Time = 0.000044 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.994859 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.995147 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.995160 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.995202 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352793.995215 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352793.995835 +INFO: TimeDuration, Event = Pool_end, Time = 0.000621 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352793.995854 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352793.995949 +INFO: TimeDuration, Event = Mul_end, Time = 0.000095 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.995963 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.995986 +INFO: TimeDuration, Event = Add_end, Time = 0.000023 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352793.995998 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352793.996019 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352793.996032 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352793.996088 +INFO: TimeDuration, Event = Mul_end, Time = 0.000055 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352793.996115 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352793.996136 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352793.996151 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352793.996285 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000133 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 140.090612, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.039040 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.039945 +INFO: TimeDuration, Event = Add_end, Time = 0.000905 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.039960 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.040787 +INFO: TimeDuration, Event = Relu_end, Time = 0.000827 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.055281 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.056183 +INFO: TimeDuration, Event = Add_end, Time = 0.000901 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.056199 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.057048 +INFO: TimeDuration, Event = Relu_end, Time = 0.000850 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352794.057084 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352794.060008 +INFO: TimeDuration, Event = Pool_end, Time = 0.002923 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.081847 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.082316 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.082332 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.082763 +INFO: TimeDuration, Event = Relu_end, Time = 0.000432 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.099515 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.099989 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.100002 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.100437 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352794.100451 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352794.107139 +INFO: TimeDuration, Event = Pool_end, Time = 0.006688 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.117305 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.117602 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.117623 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.117850 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.126529 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.126833 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.126848 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.127073 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.139138 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.139440 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.139454 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.139680 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352794.139692 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352794.142432 +INFO: TimeDuration, Event = Pool_end, Time = 0.002741 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.149034 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.149372 +INFO: TimeDuration, Event = Add_end, Time = 0.000338 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.149386 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.149507 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.155721 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.156053 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.156065 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.156187 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.164164 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.164493 +INFO: TimeDuration, Event = Add_end, Time = 0.000329 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.164507 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.164629 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352794.164647 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352794.167396 +INFO: TimeDuration, Event = Pool_end, Time = 0.002749 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.169611 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.169900 +INFO: TimeDuration, Event = Add_end, Time = 0.000289 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.169914 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.169956 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.172234 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.172521 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.172536 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.172578 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.174857 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.175143 +INFO: TimeDuration, Event = Add_end, Time = 0.000286 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.175156 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.175198 +INFO: TimeDuration, Event = Relu_end, Time = 0.000041 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352794.175210 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352794.175831 +INFO: TimeDuration, Event = Pool_end, Time = 0.000621 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352794.175849 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352794.175940 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.175954 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.175976 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.175997 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.176017 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352794.176031 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352794.176079 +INFO: TimeDuration, Event = Mul_end, Time = 0.000047 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.176091 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.176111 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352794.176125 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352794.176206 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000081 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 68.199997 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.942494, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.215996 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.216898 +INFO: TimeDuration, Event = Add_end, Time = 0.000902 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.216913 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.217747 +INFO: TimeDuration, Event = Relu_end, Time = 0.000834 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.232226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.233128 +INFO: TimeDuration, Event = Add_end, Time = 0.000902 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.233144 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.233990 +INFO: TimeDuration, Event = Relu_end, Time = 0.000846 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352794.234003 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352794.236948 +INFO: TimeDuration, Event = Pool_end, Time = 0.002944 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.259843 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.260315 +INFO: TimeDuration, Event = Add_end, Time = 0.000473 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.260473 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.260909 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.277530 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.278010 +INFO: TimeDuration, Event = Add_end, Time = 0.000480 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.278024 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.278463 +INFO: TimeDuration, Event = Relu_end, Time = 0.000439 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352794.278488 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352794.285154 +INFO: TimeDuration, Event = Pool_end, Time = 0.006666 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.295320 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.295618 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.295647 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.295873 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.304721 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.305027 +INFO: TimeDuration, Event = Add_end, Time = 0.000307 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.305041 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.305268 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.317334 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.317639 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.317653 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.317898 +INFO: TimeDuration, Event = Relu_end, Time = 0.000245 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352794.317913 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352794.320627 +INFO: TimeDuration, Event = Pool_end, Time = 0.002714 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.327210 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.327548 +INFO: TimeDuration, Event = Add_end, Time = 0.000338 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.327561 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.327684 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.333597 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.333931 +INFO: TimeDuration, Event = Add_end, Time = 0.000334 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.333946 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.334069 +INFO: TimeDuration, Event = Relu_end, Time = 0.000124 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.342027 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.342356 +INFO: TimeDuration, Event = Add_end, Time = 0.000329 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.342370 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.342492 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352794.342512 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352794.345266 +INFO: TimeDuration, Event = Pool_end, Time = 0.002754 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.347480 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.347765 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.347778 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.347820 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.350102 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.350393 +INFO: TimeDuration, Event = Add_end, Time = 0.000292 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.350406 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.350448 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.352733 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.353018 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.353031 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.353074 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352794.353087 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352794.353706 +INFO: TimeDuration, Event = Pool_end, Time = 0.000619 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352794.353725 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352794.353817 +INFO: TimeDuration, Event = Mul_end, Time = 0.000092 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.353831 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.353852 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.353865 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.353886 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352794.353900 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352794.353948 +INFO: TimeDuration, Event = Mul_end, Time = 0.000047 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.353962 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.353981 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352794.353995 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352794.354077 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000082 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 140.706851, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.394110 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.395022 +INFO: TimeDuration, Event = Add_end, Time = 0.000912 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.395036 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.395864 +INFO: TimeDuration, Event = Relu_end, Time = 0.000828 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.410361 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.411260 +INFO: TimeDuration, Event = Add_end, Time = 0.000899 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.411277 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.412131 +INFO: TimeDuration, Event = Relu_end, Time = 0.000854 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352794.412145 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352794.415083 +INFO: TimeDuration, Event = Pool_end, Time = 0.002938 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.436927 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.437397 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.437414 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.437846 +INFO: TimeDuration, Event = Relu_end, Time = 0.000432 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.454593 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.455069 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.455083 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.455521 +INFO: TimeDuration, Event = Relu_end, Time = 0.000437 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352794.455534 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352794.462221 +INFO: TimeDuration, Event = Pool_end, Time = 0.006687 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.472386 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.472684 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.472697 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.472923 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.481617 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.481920 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.481933 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.482159 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.494227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.494529 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.494542 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.494768 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352794.494781 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352794.497521 +INFO: TimeDuration, Event = Pool_end, Time = 0.002740 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.504108 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.504444 +INFO: TimeDuration, Event = Add_end, Time = 0.000335 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.504458 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.504579 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.510803 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.511135 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.511148 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.511271 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.518922 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.519253 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.519266 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.519389 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352794.519407 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352794.522165 +INFO: TimeDuration, Event = Pool_end, Time = 0.002758 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.524460 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.524745 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.524757 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.524799 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.526998 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.527284 +INFO: TimeDuration, Event = Add_end, Time = 0.000286 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.527297 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.527361 +INFO: TimeDuration, Event = Relu_end, Time = 0.000064 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.529636 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.529919 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.529933 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.529975 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352794.529988 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352794.530611 +INFO: TimeDuration, Event = Pool_end, Time = 0.000623 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352794.530629 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352794.530719 +INFO: TimeDuration, Event = Mul_end, Time = 0.000090 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.530734 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.530755 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.530766 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.530786 +INFO: TimeDuration, Event = Relu_end, Time = 0.000019 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352794.530799 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352794.530848 +INFO: TimeDuration, Event = Mul_end, Time = 0.000048 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.530862 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.530880 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352794.530896 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352794.530977 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000082 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.561853, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.571286 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.572180 +INFO: TimeDuration, Event = Add_end, Time = 0.000893 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.572196 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.573025 +INFO: TimeDuration, Event = Relu_end, Time = 0.000830 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.587508 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.588410 +INFO: TimeDuration, Event = Add_end, Time = 0.000902 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.588438 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.589291 +INFO: TimeDuration, Event = Relu_end, Time = 0.000853 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352794.589305 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352794.592234 +INFO: TimeDuration, Event = Pool_end, Time = 0.002929 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.614103 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.614571 +INFO: TimeDuration, Event = Add_end, Time = 0.000468 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.614589 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.615021 +INFO: TimeDuration, Event = Relu_end, Time = 0.000432 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.631739 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.632213 +INFO: TimeDuration, Event = Add_end, Time = 0.000474 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.632228 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.632662 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352794.632675 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352794.639366 +INFO: TimeDuration, Event = Pool_end, Time = 0.006691 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.649547 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.649844 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.649859 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.650086 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.658758 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.659063 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.659076 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.659304 +INFO: TimeDuration, Event = Relu_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.671366 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.671668 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.671680 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.671907 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352794.671920 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352794.674661 +INFO: TimeDuration, Event = Pool_end, Time = 0.002741 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.681246 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.681577 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.681591 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.681711 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.690630 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.690963 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.690979 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.691102 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.698745 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.699080 +INFO: TimeDuration, Event = Add_end, Time = 0.000335 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.699093 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.699216 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352794.699236 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352794.701986 +INFO: TimeDuration, Event = Pool_end, Time = 0.002750 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.704252 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.704542 +INFO: TimeDuration, Event = Add_end, Time = 0.000290 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.704559 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.704603 +INFO: TimeDuration, Event = Relu_end, Time = 0.000044 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.706899 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.707191 +INFO: TimeDuration, Event = Add_end, Time = 0.000291 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.707205 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.707248 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.709524 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.709809 +INFO: TimeDuration, Event = Add_end, Time = 0.000286 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.709824 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.709866 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352794.709878 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352794.710500 +INFO: TimeDuration, Event = Pool_end, Time = 0.000622 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352794.710518 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352794.710609 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.710624 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.710646 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.710657 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.710676 +INFO: TimeDuration, Event = Relu_end, Time = 0.000019 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352794.710692 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352794.710739 +INFO: TimeDuration, Event = Mul_end, Time = 0.000047 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.710752 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.710771 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352794.710786 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352794.710868 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000082 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 142.353208, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.750828 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.751737 +INFO: TimeDuration, Event = Add_end, Time = 0.000909 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.751752 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.752577 +INFO: TimeDuration, Event = Relu_end, Time = 0.000825 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.767106 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.768015 +INFO: TimeDuration, Event = Add_end, Time = 0.000909 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.768030 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.768888 +INFO: TimeDuration, Event = Relu_end, Time = 0.000858 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352794.768904 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352794.771831 +INFO: TimeDuration, Event = Pool_end, Time = 0.002927 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.793678 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.794149 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.794164 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.794600 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.811321 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.811797 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.811810 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.812246 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352794.812259 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352794.818934 +INFO: TimeDuration, Event = Pool_end, Time = 0.006676 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.829104 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.829403 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.829416 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.829643 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.838343 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.838648 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.838660 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.838887 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.850955 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.851257 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.851283 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.851510 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352794.851522 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352794.854250 +INFO: TimeDuration, Event = Pool_end, Time = 0.002728 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.860832 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.861166 +INFO: TimeDuration, Event = Add_end, Time = 0.000334 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.861181 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.861302 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.867220 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.867550 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.867564 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.867687 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.875340 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.875672 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.875686 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.875808 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352794.875826 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352794.878581 +INFO: TimeDuration, Event = Pool_end, Time = 0.002754 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.880800 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.881086 +INFO: TimeDuration, Event = Add_end, Time = 0.000286 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.881099 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.881142 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.883432 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.883719 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.883733 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.883775 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.886056 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.886340 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.886353 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.886396 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352794.886409 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352794.887031 +INFO: TimeDuration, Event = Pool_end, Time = 0.000622 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352794.887051 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352794.887142 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.887156 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.887177 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.887188 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.887208 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352794.887223 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352794.887271 +INFO: TimeDuration, Event = Mul_end, Time = 0.000047 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.887287 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.887307 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352794.887321 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352794.887401 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000080 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 66.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.393571, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.927896 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.928809 +INFO: TimeDuration, Event = Add_end, Time = 0.000913 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.928828 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.929663 +INFO: TimeDuration, Event = Relu_end, Time = 0.000835 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.944148 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.945050 +INFO: TimeDuration, Event = Add_end, Time = 0.000902 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.945069 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.945923 +INFO: TimeDuration, Event = Relu_end, Time = 0.000854 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352794.945936 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352794.948873 +INFO: TimeDuration, Event = Pool_end, Time = 0.002938 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.970727 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.971201 +INFO: TimeDuration, Event = Add_end, Time = 0.000474 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.971216 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.971650 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352794.988381 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352794.988857 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352794.988870 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352794.989305 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352794.989318 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352794.996008 +INFO: TimeDuration, Event = Pool_end, Time = 0.006690 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.006168 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.006465 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.006478 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.006703 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.015401 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.015703 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.015717 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.015943 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.028016 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.028318 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.028454 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.028681 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352795.028694 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352795.031311 +INFO: TimeDuration, Event = Pool_end, Time = 0.002617 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.037894 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.038227 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.038240 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.038361 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.044585 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.044917 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.044931 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.045053 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.052752 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.053079 +INFO: TimeDuration, Event = Add_end, Time = 0.000327 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.053092 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.053214 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352795.053232 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352795.055992 +INFO: TimeDuration, Event = Pool_end, Time = 0.002761 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.058204 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.058491 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.058504 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.058546 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.060829 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.061118 +INFO: TimeDuration, Event = Add_end, Time = 0.000289 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.061132 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.061174 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.063456 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.063739 +INFO: TimeDuration, Event = Add_end, Time = 0.000283 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.063752 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.063794 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352795.063806 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352795.064431 +INFO: TimeDuration, Event = Pool_end, Time = 0.000626 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352795.064449 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352795.064542 +INFO: TimeDuration, Event = Mul_end, Time = 0.000093 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.064555 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.064579 +INFO: TimeDuration, Event = Add_end, Time = 0.000024 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.064591 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.064610 +INFO: TimeDuration, Event = Relu_end, Time = 0.000019 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352795.064626 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352795.064674 +INFO: TimeDuration, Event = Mul_end, Time = 0.000048 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.064686 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.064706 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352795.064719 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352795.064804 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000085 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.581644, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.105062 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.105965 +INFO: TimeDuration, Event = Add_end, Time = 0.000903 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.105981 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.106806 +INFO: TimeDuration, Event = Relu_end, Time = 0.000825 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.121287 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.122187 +INFO: TimeDuration, Event = Add_end, Time = 0.000900 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.122204 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.123056 +INFO: TimeDuration, Event = Relu_end, Time = 0.000852 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352795.123070 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352795.126014 +INFO: TimeDuration, Event = Pool_end, Time = 0.002944 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.147854 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.148324 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.148443 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.148875 +INFO: TimeDuration, Event = Relu_end, Time = 0.000432 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.165509 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.165983 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.165998 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.166431 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352795.166445 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352795.173137 +INFO: TimeDuration, Event = Pool_end, Time = 0.006692 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.183300 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.183597 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.183611 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.183836 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.192534 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.192838 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.192853 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.193078 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.205143 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.205445 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.205459 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.205684 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352795.205697 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352795.208435 +INFO: TimeDuration, Event = Pool_end, Time = 0.002738 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.215028 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.215363 +INFO: TimeDuration, Event = Add_end, Time = 0.000335 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.215376 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.215497 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.221412 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.221742 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.221755 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.221878 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.229839 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.230171 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.230184 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.230305 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352795.230323 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352795.233080 +INFO: TimeDuration, Event = Pool_end, Time = 0.002757 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.235291 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.235577 +INFO: TimeDuration, Event = Add_end, Time = 0.000286 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.235590 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.235632 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.237944 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.238231 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.238245 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.238287 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.240564 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.240849 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.240862 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.240904 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352795.240936 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352795.241538 +INFO: TimeDuration, Event = Pool_end, Time = 0.000602 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352795.241560 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352795.241652 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.241667 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.241688 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.241699 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.241719 +INFO: TimeDuration, Event = Relu_end, Time = 0.000019 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352795.241733 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352795.241781 +INFO: TimeDuration, Event = Mul_end, Time = 0.000048 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.241793 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.241813 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352795.241827 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352795.241906 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000079 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.518731, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.282159 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.283060 +INFO: TimeDuration, Event = Add_end, Time = 0.000901 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.283076 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.283901 +INFO: TimeDuration, Event = Relu_end, Time = 0.000824 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.298386 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.299283 +INFO: TimeDuration, Event = Add_end, Time = 0.000897 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.299299 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.300148 +INFO: TimeDuration, Event = Relu_end, Time = 0.000850 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352795.300163 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352795.303113 +INFO: TimeDuration, Event = Pool_end, Time = 0.002951 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.328114 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.328597 +INFO: TimeDuration, Event = Add_end, Time = 0.000483 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.328616 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.329060 +INFO: TimeDuration, Event = Relu_end, Time = 0.000444 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.343253 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.343725 +INFO: TimeDuration, Event = Add_end, Time = 0.000473 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.343741 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.344176 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352795.344189 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352795.350882 +INFO: TimeDuration, Event = Pool_end, Time = 0.006693 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.361054 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.361356 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.361370 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.361596 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.370288 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.370593 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.370605 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.370833 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.382890 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.383189 +INFO: TimeDuration, Event = Add_end, Time = 0.000300 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.383203 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.383428 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352795.383442 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352795.386183 +INFO: TimeDuration, Event = Pool_end, Time = 0.002741 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.392767 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.393095 +INFO: TimeDuration, Event = Add_end, Time = 0.000327 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.393107 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.393229 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.399171 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.399498 +INFO: TimeDuration, Event = Add_end, Time = 0.000327 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.399512 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.399634 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.407290 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.407614 +INFO: TimeDuration, Event = Add_end, Time = 0.000325 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.407628 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.407749 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352795.407766 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352795.410531 +INFO: TimeDuration, Event = Pool_end, Time = 0.002764 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.412772 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.413056 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.413069 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.413110 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.415402 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.415692 +INFO: TimeDuration, Event = Add_end, Time = 0.000290 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.415705 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.415746 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.418034 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.418319 +INFO: TimeDuration, Event = Add_end, Time = 0.000286 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.418332 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.418374 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352795.418402 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352795.419007 +INFO: TimeDuration, Event = Pool_end, Time = 0.000606 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352795.419030 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352795.419130 +INFO: TimeDuration, Event = Mul_end, Time = 0.000100 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.419143 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.419167 +INFO: TimeDuration, Event = Add_end, Time = 0.000023 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.419178 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.419197 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352795.419211 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352795.419266 +INFO: TimeDuration, Event = Mul_end, Time = 0.000055 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.419280 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.419299 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352795.419314 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352795.419395 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000081 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.970234, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.459498 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.460401 +INFO: TimeDuration, Event = Add_end, Time = 0.000903 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.460476 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.461293 +INFO: TimeDuration, Event = Relu_end, Time = 0.000817 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.475804 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.476707 +INFO: TimeDuration, Event = Add_end, Time = 0.000903 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.476724 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.477579 +INFO: TimeDuration, Event = Relu_end, Time = 0.000855 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352795.477592 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352795.480532 +INFO: TimeDuration, Event = Pool_end, Time = 0.002940 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.502370 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.502840 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.502856 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.503292 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.520024 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.520499 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.520513 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.520947 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352795.520960 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352795.527651 +INFO: TimeDuration, Event = Pool_end, Time = 0.006691 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.537812 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.538110 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.538148 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.538375 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.547209 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.547516 +INFO: TimeDuration, Event = Add_end, Time = 0.000306 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.547529 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.547755 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.560266 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.560570 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.560585 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.560812 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352795.560826 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352795.563560 +INFO: TimeDuration, Event = Pool_end, Time = 0.002734 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.570147 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.570482 +INFO: TimeDuration, Event = Add_end, Time = 0.000335 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.570495 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.570616 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.576534 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.576868 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.576882 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.577005 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.584970 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.585304 +INFO: TimeDuration, Event = Add_end, Time = 0.000334 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.585317 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.585439 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352795.585457 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352795.588211 +INFO: TimeDuration, Event = Pool_end, Time = 0.002754 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.590443 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.590733 +INFO: TimeDuration, Event = Add_end, Time = 0.000290 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.590747 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.590788 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.593045 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.593333 +INFO: TimeDuration, Event = Add_end, Time = 0.000288 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.593346 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.593387 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.595690 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.595976 +INFO: TimeDuration, Event = Add_end, Time = 0.000286 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.595990 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.596032 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352795.596044 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352795.596666 +INFO: TimeDuration, Event = Pool_end, Time = 0.000623 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352795.596684 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352795.596775 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.596788 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.596809 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.596823 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.596842 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352795.596858 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352795.596906 +INFO: TimeDuration, Event = Mul_end, Time = 0.000048 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.596918 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.596937 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352795.596952 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352795.597034 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000082 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 66.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 140.214443, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.637347 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.638251 +INFO: TimeDuration, Event = Add_end, Time = 0.000904 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.638267 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.639096 +INFO: TimeDuration, Event = Relu_end, Time = 0.000828 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.653579 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.654482 +INFO: TimeDuration, Event = Add_end, Time = 0.000904 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.654498 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.655353 +INFO: TimeDuration, Event = Relu_end, Time = 0.000854 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352795.655366 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352795.658306 +INFO: TimeDuration, Event = Pool_end, Time = 0.002940 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.680145 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.680616 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.680633 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.681069 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.698955 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.699430 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.699444 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.699878 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352795.699890 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352795.706580 +INFO: TimeDuration, Event = Pool_end, Time = 0.006690 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.716746 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.717043 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.717057 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.717283 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.725976 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.726280 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.726294 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.726519 +INFO: TimeDuration, Event = Relu_end, Time = 0.000224 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.738937 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.739247 +INFO: TimeDuration, Event = Add_end, Time = 0.000310 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.739261 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.739487 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352795.739500 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352795.742231 +INFO: TimeDuration, Event = Pool_end, Time = 0.002731 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.748821 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.749153 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.749166 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.749288 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.755512 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.755837 +INFO: TimeDuration, Event = Add_end, Time = 0.000325 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.755849 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.755971 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.763634 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.763961 +INFO: TimeDuration, Event = Add_end, Time = 0.000328 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.763974 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.764096 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352795.764114 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352795.766873 +INFO: TimeDuration, Event = Pool_end, Time = 0.002759 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.769067 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.769352 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.769366 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.769408 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.771690 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.771976 +INFO: TimeDuration, Event = Add_end, Time = 0.000286 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.771990 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.772032 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.774322 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.774608 +INFO: TimeDuration, Event = Add_end, Time = 0.000286 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.774621 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.774662 +INFO: TimeDuration, Event = Relu_end, Time = 0.000041 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352795.774675 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352795.775298 +INFO: TimeDuration, Event = Pool_end, Time = 0.000623 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352795.775316 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352795.775407 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.775421 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.775448 +INFO: TimeDuration, Event = Add_end, Time = 0.000027 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.775460 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.775480 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352795.775494 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352795.775541 +INFO: TimeDuration, Event = Mul_end, Time = 0.000048 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.775554 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.775574 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352795.775588 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352795.775671 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000083 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 141.104110, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.815636 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.816537 +INFO: TimeDuration, Event = Add_end, Time = 0.000901 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.816555 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.817378 +INFO: TimeDuration, Event = Relu_end, Time = 0.000824 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.831862 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.832760 +INFO: TimeDuration, Event = Add_end, Time = 0.000898 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.832777 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.833626 +INFO: TimeDuration, Event = Relu_end, Time = 0.000849 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352795.833640 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352795.836586 +INFO: TimeDuration, Event = Pool_end, Time = 0.002945 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.858428 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.858898 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.858916 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.859350 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.876073 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.876549 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.876563 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.877000 +INFO: TimeDuration, Event = Relu_end, Time = 0.000437 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352795.877013 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352795.883698 +INFO: TimeDuration, Event = Pool_end, Time = 0.006685 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.893880 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.894177 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.894191 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.894417 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.903101 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.903406 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.903419 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.903645 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.915727 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.916028 +INFO: TimeDuration, Event = Add_end, Time = 0.000300 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.916044 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.916271 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352795.916286 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352795.919019 +INFO: TimeDuration, Event = Pool_end, Time = 0.002734 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.925605 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.925942 +INFO: TimeDuration, Event = Add_end, Time = 0.000338 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.925969 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.926091 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.932310 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.932648 +INFO: TimeDuration, Event = Add_end, Time = 0.000338 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.932661 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.932782 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.940740 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.941072 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.941086 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.941209 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352795.941228 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352795.943981 +INFO: TimeDuration, Event = Pool_end, Time = 0.002753 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.946208 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.946489 +INFO: TimeDuration, Event = Add_end, Time = 0.000282 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.946503 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.946545 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.948827 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.949110 +INFO: TimeDuration, Event = Add_end, Time = 0.000283 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.949123 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.949166 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.951464 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.951751 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.951765 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.951807 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352795.951821 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352795.952440 +INFO: TimeDuration, Event = Pool_end, Time = 0.000618 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352795.952459 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352795.952550 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.952564 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.952585 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.952598 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.952617 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352795.952631 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352795.952679 +INFO: TimeDuration, Event = Mul_end, Time = 0.000048 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.952692 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.952711 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352795.952739 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352795.952821 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000082 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 68.199997 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.965790, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352795.993235 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352795.994136 +INFO: TimeDuration, Event = Add_end, Time = 0.000901 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352795.994151 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352795.994978 +INFO: TimeDuration, Event = Relu_end, Time = 0.000827 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.009476 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.010375 +INFO: TimeDuration, Event = Add_end, Time = 0.000899 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.010390 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.011241 +INFO: TimeDuration, Event = Relu_end, Time = 0.000851 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352796.011254 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352796.014202 +INFO: TimeDuration, Event = Pool_end, Time = 0.002948 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.036046 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.036517 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.036533 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.036964 +INFO: TimeDuration, Event = Relu_end, Time = 0.000431 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.053700 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.054175 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.054189 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.054624 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352796.054636 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352796.061326 +INFO: TimeDuration, Event = Pool_end, Time = 0.006690 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.071489 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.071786 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.071799 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.072024 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.080723 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.081029 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.081043 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.081269 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.093331 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.093632 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.093647 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.093873 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352796.093886 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352796.096626 +INFO: TimeDuration, Event = Pool_end, Time = 0.002740 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.103212 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.103548 +INFO: TimeDuration, Event = Add_end, Time = 0.000336 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.103562 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.103685 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.109587 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.109920 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.109932 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.110054 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.117714 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.118043 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.118056 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.118179 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352796.118197 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352796.120955 +INFO: TimeDuration, Event = Pool_end, Time = 0.002757 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.123178 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.123469 +INFO: TimeDuration, Event = Add_end, Time = 0.000291 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.123482 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.123523 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.125800 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.126096 +INFO: TimeDuration, Event = Add_end, Time = 0.000296 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.126109 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.126151 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.130282 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.130572 +INFO: TimeDuration, Event = Add_end, Time = 0.000290 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.130585 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.130629 +INFO: TimeDuration, Event = Relu_end, Time = 0.000044 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352796.130641 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352796.131264 +INFO: TimeDuration, Event = Pool_end, Time = 0.000623 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352796.131288 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352796.131387 +INFO: TimeDuration, Event = Mul_end, Time = 0.000099 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.131415 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.131440 +INFO: TimeDuration, Event = Add_end, Time = 0.000024 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.131453 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.131473 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352796.131488 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352796.131545 +INFO: TimeDuration, Event = Mul_end, Time = 0.000056 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.131558 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.131577 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352796.131591 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352796.131725 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000134 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 141.247539, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.174393 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.175320 +INFO: TimeDuration, Event = Add_end, Time = 0.000927 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.175335 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.176160 +INFO: TimeDuration, Event = Relu_end, Time = 0.000825 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.190662 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.191565 +INFO: TimeDuration, Event = Add_end, Time = 0.000903 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.191580 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.192433 +INFO: TimeDuration, Event = Relu_end, Time = 0.000853 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352796.192449 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352796.195388 +INFO: TimeDuration, Event = Pool_end, Time = 0.002940 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.217231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.217700 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.217719 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.218155 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.234882 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.235356 +INFO: TimeDuration, Event = Add_end, Time = 0.000474 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.235373 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.235812 +INFO: TimeDuration, Event = Relu_end, Time = 0.000439 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352796.235826 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352796.242511 +INFO: TimeDuration, Event = Pool_end, Time = 0.006685 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.252688 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.252986 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.252999 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.253227 +INFO: TimeDuration, Event = Relu_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.262128 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.262433 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.262447 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.262675 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.274741 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.275043 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.275055 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.275282 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352796.275295 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352796.278037 +INFO: TimeDuration, Event = Pool_end, Time = 0.002743 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.284616 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.284951 +INFO: TimeDuration, Event = Add_end, Time = 0.000336 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.284965 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.285086 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.290993 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.291324 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.291337 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.291459 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.299432 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.299764 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.299777 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.299899 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352796.299921 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352796.302673 +INFO: TimeDuration, Event = Pool_end, Time = 0.002753 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.304878 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.305164 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.305178 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.305220 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.307509 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.307796 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.307826 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.307868 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.310141 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.310430 +INFO: TimeDuration, Event = Add_end, Time = 0.000288 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.310443 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.310484 +INFO: TimeDuration, Event = Relu_end, Time = 0.000041 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352796.310498 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352796.311116 +INFO: TimeDuration, Event = Pool_end, Time = 0.000619 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352796.311136 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352796.311228 +INFO: TimeDuration, Event = Mul_end, Time = 0.000092 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.311241 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.311262 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.311295 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.311316 +INFO: TimeDuration, Event = Relu_end, Time = 0.000021 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352796.311330 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352796.311379 +INFO: TimeDuration, Event = Mul_end, Time = 0.000049 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.311392 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.311411 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352796.311427 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352796.311514 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000087 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.896996, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.351461 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.352360 +INFO: TimeDuration, Event = Add_end, Time = 0.000900 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.352476 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.353294 +INFO: TimeDuration, Event = Relu_end, Time = 0.000818 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.367783 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.368679 +INFO: TimeDuration, Event = Add_end, Time = 0.000896 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.368697 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.369551 +INFO: TimeDuration, Event = Relu_end, Time = 0.000854 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352796.369564 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352796.372501 +INFO: TimeDuration, Event = Pool_end, Time = 0.002937 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.394343 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.394813 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.394829 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.395266 +INFO: TimeDuration, Event = Relu_end, Time = 0.000437 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.412004 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.412479 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.412493 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.412930 +INFO: TimeDuration, Event = Relu_end, Time = 0.000437 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352796.412943 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352796.419631 +INFO: TimeDuration, Event = Pool_end, Time = 0.006688 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.429791 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.430089 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.430101 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.430328 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.439042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.439346 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.439360 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.439586 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.451655 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.451957 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.451970 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.452195 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352796.452209 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352796.454949 +INFO: TimeDuration, Event = Pool_end, Time = 0.002740 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.461534 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.461871 +INFO: TimeDuration, Event = Add_end, Time = 0.000337 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.461885 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.462008 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.468225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.468556 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.468571 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.468694 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.476351 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.476681 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.476695 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.476816 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352796.476833 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352796.479592 +INFO: TimeDuration, Event = Pool_end, Time = 0.002759 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.481814 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.482099 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.482113 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.482155 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.484428 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.484719 +INFO: TimeDuration, Event = Add_end, Time = 0.000291 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.484734 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.484776 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.487038 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.487321 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.487335 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.487377 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352796.487391 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352796.488014 +INFO: TimeDuration, Event = Pool_end, Time = 0.000623 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352796.488032 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352796.488122 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.488136 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.488157 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.488169 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.488189 +INFO: TimeDuration, Event = Relu_end, Time = 0.000019 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352796.488202 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352796.488249 +INFO: TimeDuration, Event = Mul_end, Time = 0.000047 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.488262 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.488281 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352796.488295 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352796.488388 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000093 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.642443, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.528758 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.529666 +INFO: TimeDuration, Event = Add_end, Time = 0.000908 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.529686 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.530516 +INFO: TimeDuration, Event = Relu_end, Time = 0.000830 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.545003 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.545909 +INFO: TimeDuration, Event = Add_end, Time = 0.000905 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.545924 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.546784 +INFO: TimeDuration, Event = Relu_end, Time = 0.000860 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352796.546798 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352796.549728 +INFO: TimeDuration, Event = Pool_end, Time = 0.002930 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.571568 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.572038 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.572056 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.572491 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.589212 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.589687 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.589703 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.590138 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352796.590151 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352796.596840 +INFO: TimeDuration, Event = Pool_end, Time = 0.006689 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.607001 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.607299 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.607313 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.607540 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.616225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.616529 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.616543 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.616770 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.628830 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.629132 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.629145 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.629371 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352796.629384 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352796.632125 +INFO: TimeDuration, Event = Pool_end, Time = 0.002741 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.638723 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.639058 +INFO: TimeDuration, Event = Add_end, Time = 0.000336 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.639071 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.639193 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.645101 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.645432 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.645445 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.645567 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.653218 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.653549 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.653564 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.653686 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352796.653704 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352796.656460 +INFO: TimeDuration, Event = Pool_end, Time = 0.002756 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.658689 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.658974 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.658989 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.659030 +INFO: TimeDuration, Event = Relu_end, Time = 0.000041 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.661309 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.661599 +INFO: TimeDuration, Event = Add_end, Time = 0.000290 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.661626 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.661669 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.663928 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.664212 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.664226 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.664268 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352796.664281 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352796.664907 +INFO: TimeDuration, Event = Pool_end, Time = 0.000625 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352796.664925 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352796.665015 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.665029 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.665050 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.665062 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.665081 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352796.665096 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352796.665144 +INFO: TimeDuration, Event = Mul_end, Time = 0.000047 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.665158 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.665177 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352796.665191 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352796.665277 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000086 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 66.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.290147, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.705183 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.706087 +INFO: TimeDuration, Event = Add_end, Time = 0.000905 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.706103 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.706932 +INFO: TimeDuration, Event = Relu_end, Time = 0.000828 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.721431 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.722332 +INFO: TimeDuration, Event = Add_end, Time = 0.000901 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.722348 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.723199 +INFO: TimeDuration, Event = Relu_end, Time = 0.000851 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352796.723212 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352796.726156 +INFO: TimeDuration, Event = Pool_end, Time = 0.002944 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.748009 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.748481 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.748499 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.748935 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.765668 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.766142 +INFO: TimeDuration, Event = Add_end, Time = 0.000474 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.766155 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.766590 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352796.766603 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352796.773295 +INFO: TimeDuration, Event = Pool_end, Time = 0.006692 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.783453 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.783750 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.783762 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.783989 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.792676 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.792980 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.792994 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.793219 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.805287 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.805588 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.805603 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.805830 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352796.805843 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352796.808583 +INFO: TimeDuration, Event = Pool_end, Time = 0.002739 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.815164 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.815499 +INFO: TimeDuration, Event = Add_end, Time = 0.000335 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.815511 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.815633 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.821875 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.822207 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.822220 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.822343 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.829993 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.830321 +INFO: TimeDuration, Event = Add_end, Time = 0.000329 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.830335 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.830459 +INFO: TimeDuration, Event = Relu_end, Time = 0.000124 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352796.830477 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352796.833232 +INFO: TimeDuration, Event = Pool_end, Time = 0.002754 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.835450 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.835736 +INFO: TimeDuration, Event = Add_end, Time = 0.000286 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.835749 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.835791 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.838082 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.838372 +INFO: TimeDuration, Event = Add_end, Time = 0.000290 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.838387 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.838429 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.840713 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.840997 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.841023 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.841066 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352796.841079 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352796.841687 +INFO: TimeDuration, Event = Pool_end, Time = 0.000608 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352796.841706 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352796.841801 +INFO: TimeDuration, Event = Mul_end, Time = 0.000095 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.841815 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.841836 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.841848 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.841868 +INFO: TimeDuration, Event = Relu_end, Time = 0.000019 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352796.841883 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352796.841932 +INFO: TimeDuration, Event = Mul_end, Time = 0.000049 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.841945 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.841964 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352796.841977 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352796.842061 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000084 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.639957, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.882068 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.882974 +INFO: TimeDuration, Event = Add_end, Time = 0.000906 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.882990 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.883816 +INFO: TimeDuration, Event = Relu_end, Time = 0.000826 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.898309 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.899209 +INFO: TimeDuration, Event = Add_end, Time = 0.000901 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.899225 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.900074 +INFO: TimeDuration, Event = Relu_end, Time = 0.000849 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352796.900088 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352796.903044 +INFO: TimeDuration, Event = Pool_end, Time = 0.002956 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.924888 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.925359 +INFO: TimeDuration, Event = Add_end, Time = 0.000472 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.925377 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.925815 +INFO: TimeDuration, Event = Relu_end, Time = 0.000437 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.942611 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.943093 +INFO: TimeDuration, Event = Add_end, Time = 0.000482 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.943107 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.943546 +INFO: TimeDuration, Event = Relu_end, Time = 0.000439 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352796.943558 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352796.950240 +INFO: TimeDuration, Event = Pool_end, Time = 0.006682 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.960405 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.960705 +INFO: TimeDuration, Event = Add_end, Time = 0.000300 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.960719 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.960956 +INFO: TimeDuration, Event = Relu_end, Time = 0.000237 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.969836 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.970140 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.970154 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.970379 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.982572 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.982874 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.982886 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.983112 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352796.983125 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352796.985865 +INFO: TimeDuration, Event = Pool_end, Time = 0.002740 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.992452 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.992786 +INFO: TimeDuration, Event = Add_end, Time = 0.000335 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.992800 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.992921 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352796.998836 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352796.999166 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352796.999179 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352796.999301 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.007285 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.007617 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.007630 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.007753 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352797.007772 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352797.010527 +INFO: TimeDuration, Event = Pool_end, Time = 0.002755 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.012753 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.013039 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.013053 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.013095 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.015378 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.015662 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.015676 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.015718 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.018010 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.018293 +INFO: TimeDuration, Event = Add_end, Time = 0.000283 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.018306 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.018348 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352797.018361 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352797.018983 +INFO: TimeDuration, Event = Pool_end, Time = 0.000622 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352797.019001 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352797.019096 +INFO: TimeDuration, Event = Mul_end, Time = 0.000094 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.019109 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.019133 +INFO: TimeDuration, Event = Add_end, Time = 0.000024 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.019145 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.019165 +INFO: TimeDuration, Event = Relu_end, Time = 0.000019 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352797.019194 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352797.019246 +INFO: TimeDuration, Event = Mul_end, Time = 0.000052 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.019259 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.019279 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352797.019294 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352797.019383 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000089 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 140.098899, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.060228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.061137 +INFO: TimeDuration, Event = Add_end, Time = 0.000908 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.061155 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.061981 +INFO: TimeDuration, Event = Relu_end, Time = 0.000827 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.076474 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.077375 +INFO: TimeDuration, Event = Add_end, Time = 0.000901 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.077406 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.078257 +INFO: TimeDuration, Event = Relu_end, Time = 0.000851 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352797.078270 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352797.081200 +INFO: TimeDuration, Event = Pool_end, Time = 0.002929 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.103042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.103518 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.103534 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.103967 +INFO: TimeDuration, Event = Relu_end, Time = 0.000433 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.120704 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.121179 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.121194 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.121631 +INFO: TimeDuration, Event = Relu_end, Time = 0.000438 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352797.121645 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352797.128331 +INFO: TimeDuration, Event = Pool_end, Time = 0.006686 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.138494 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.138793 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.138807 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.139033 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.147718 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.148023 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.148036 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.148260 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.160341 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.160641 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.160654 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.160879 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352797.160892 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352797.163629 +INFO: TimeDuration, Event = Pool_end, Time = 0.002737 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.170222 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.170561 +INFO: TimeDuration, Event = Add_end, Time = 0.000338 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.170573 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.170695 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.176605 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.176937 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.176951 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.177074 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.184728 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.185056 +INFO: TimeDuration, Event = Add_end, Time = 0.000328 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.185069 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.185191 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352797.185211 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352797.187969 +INFO: TimeDuration, Event = Pool_end, Time = 0.002758 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.190203 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.190492 +INFO: TimeDuration, Event = Add_end, Time = 0.000289 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.190508 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.190550 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.192810 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.193096 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.193111 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.193153 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.195422 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.195706 +INFO: TimeDuration, Event = Add_end, Time = 0.000283 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.195720 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.195763 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352797.195775 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352797.196398 +INFO: TimeDuration, Event = Pool_end, Time = 0.000623 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352797.196438 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352797.196530 +INFO: TimeDuration, Event = Mul_end, Time = 0.000092 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.196546 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.196567 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.196579 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.196599 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352797.196612 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352797.196660 +INFO: TimeDuration, Event = Mul_end, Time = 0.000048 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.196673 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.196692 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352797.196707 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352797.196792 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000085 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.305292, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.237023 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.237926 +INFO: TimeDuration, Event = Add_end, Time = 0.000903 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.237942 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.238769 +INFO: TimeDuration, Event = Relu_end, Time = 0.000827 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.253259 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.254164 +INFO: TimeDuration, Event = Add_end, Time = 0.000904 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.254179 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.255033 +INFO: TimeDuration, Event = Relu_end, Time = 0.000854 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352797.255046 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352797.257985 +INFO: TimeDuration, Event = Pool_end, Time = 0.002939 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.279826 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.280298 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.280439 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.280874 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.297485 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.297962 +INFO: TimeDuration, Event = Add_end, Time = 0.000477 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.297975 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.298411 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352797.298424 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352797.305115 +INFO: TimeDuration, Event = Pool_end, Time = 0.006690 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.315278 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.315574 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.315587 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.315814 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.324513 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.324818 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.324832 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.325058 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.337128 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.337430 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.337443 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.337670 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352797.337683 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352797.340421 +INFO: TimeDuration, Event = Pool_end, Time = 0.002738 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.347008 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.347341 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.347354 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.347476 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.353709 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.354043 +INFO: TimeDuration, Event = Add_end, Time = 0.000334 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.354056 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.354178 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.362139 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.362469 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.362482 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.362605 +INFO: TimeDuration, Event = Relu_end, Time = 0.000124 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352797.362622 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352797.365378 +INFO: TimeDuration, Event = Pool_end, Time = 0.002756 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.367602 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.367887 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.367900 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.367942 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.370238 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.370527 +INFO: TimeDuration, Event = Add_end, Time = 0.000289 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.370541 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.370583 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.372861 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.373141 +INFO: TimeDuration, Event = Add_end, Time = 0.000280 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.373154 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.373196 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352797.373208 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352797.373833 +INFO: TimeDuration, Event = Pool_end, Time = 0.000625 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352797.373852 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352797.373942 +INFO: TimeDuration, Event = Mul_end, Time = 0.000090 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.373956 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.373977 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.373990 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.374010 +INFO: TimeDuration, Event = Relu_end, Time = 0.000019 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352797.374024 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352797.374071 +INFO: TimeDuration, Event = Mul_end, Time = 0.000047 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.374084 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.374103 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352797.374118 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352797.374204 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000086 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 66.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.920491, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.414056 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.414961 +INFO: TimeDuration, Event = Add_end, Time = 0.000905 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.414976 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.415804 +INFO: TimeDuration, Event = Relu_end, Time = 0.000828 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.433158 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.434062 +INFO: TimeDuration, Event = Add_end, Time = 0.000904 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.434078 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.434933 +INFO: TimeDuration, Event = Relu_end, Time = 0.000856 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352797.434948 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352797.437887 +INFO: TimeDuration, Event = Pool_end, Time = 0.002939 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.459726 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.460196 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.460212 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.460649 +INFO: TimeDuration, Event = Relu_end, Time = 0.000437 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.477359 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.477836 +INFO: TimeDuration, Event = Add_end, Time = 0.000478 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.477850 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.478284 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352797.478296 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352797.484987 +INFO: TimeDuration, Event = Pool_end, Time = 0.006691 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.495158 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.495458 +INFO: TimeDuration, Event = Add_end, Time = 0.000300 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.495472 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.495697 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.504390 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.504694 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.504708 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.504934 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.517010 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.517312 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.517326 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.517552 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352797.517566 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352797.520312 +INFO: TimeDuration, Event = Pool_end, Time = 0.002746 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.526898 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.527235 +INFO: TimeDuration, Event = Add_end, Time = 0.000338 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.527249 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.527374 +INFO: TimeDuration, Event = Relu_end, Time = 0.000125 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.533591 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.533921 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.533934 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.534055 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.541709 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.542040 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.542053 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.542175 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352797.542195 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352797.544950 +INFO: TimeDuration, Event = Pool_end, Time = 0.002755 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.547187 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.547470 +INFO: TimeDuration, Event = Add_end, Time = 0.000283 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.547483 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.547525 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.549829 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.550116 +INFO: TimeDuration, Event = Add_end, Time = 0.000286 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.550128 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.550171 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.552454 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.552735 +INFO: TimeDuration, Event = Add_end, Time = 0.000282 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.552748 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.552792 +INFO: TimeDuration, Event = Relu_end, Time = 0.000044 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352797.552805 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352797.553429 +INFO: TimeDuration, Event = Pool_end, Time = 0.000624 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352797.553448 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352797.553546 +INFO: TimeDuration, Event = Mul_end, Time = 0.000098 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.553559 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.553581 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.553605 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.553626 +INFO: TimeDuration, Event = Relu_end, Time = 0.000021 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352797.553640 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352797.553690 +INFO: TimeDuration, Event = Mul_end, Time = 0.000050 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.553703 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.553723 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352797.553737 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352797.553829 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000093 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 142.537833, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.593816 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.594721 +INFO: TimeDuration, Event = Add_end, Time = 0.000905 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.594737 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.595563 +INFO: TimeDuration, Event = Relu_end, Time = 0.000826 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.610059 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.610962 +INFO: TimeDuration, Event = Add_end, Time = 0.000902 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.610977 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.611827 +INFO: TimeDuration, Event = Relu_end, Time = 0.000850 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352797.611840 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352797.614787 +INFO: TimeDuration, Event = Pool_end, Time = 0.002947 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.636635 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.637111 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.637127 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.637560 +INFO: TimeDuration, Event = Relu_end, Time = 0.000433 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.654299 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.654775 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.654790 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.655223 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352797.655236 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352797.661915 +INFO: TimeDuration, Event = Pool_end, Time = 0.006678 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.672089 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.672388 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.672434 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.672660 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.681315 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.681621 +INFO: TimeDuration, Event = Add_end, Time = 0.000306 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.681633 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.681859 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.693928 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.694229 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.694242 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.694469 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352797.694494 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352797.697226 +INFO: TimeDuration, Event = Pool_end, Time = 0.002732 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.703808 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.704147 +INFO: TimeDuration, Event = Add_end, Time = 0.000339 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.704159 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.704282 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.710191 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.710523 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.710537 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.710659 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.718321 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.718651 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.718684 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.718806 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352797.718825 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352797.721564 +INFO: TimeDuration, Event = Pool_end, Time = 0.002739 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.723779 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.724066 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.724080 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.724123 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.726406 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.726696 +INFO: TimeDuration, Event = Add_end, Time = 0.000290 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.726709 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.726751 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.729038 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.729323 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.729337 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.729380 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352797.729393 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352797.730013 +INFO: TimeDuration, Event = Pool_end, Time = 0.000621 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352797.730032 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352797.730122 +INFO: TimeDuration, Event = Mul_end, Time = 0.000090 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.730136 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.730157 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.730170 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.730189 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352797.730204 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352797.730252 +INFO: TimeDuration, Event = Mul_end, Time = 0.000047 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.730266 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.730285 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352797.730299 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352797.730385 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000086 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 68.199997 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.308043, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.770441 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.771343 +INFO: TimeDuration, Event = Add_end, Time = 0.000902 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.771358 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.772183 +INFO: TimeDuration, Event = Relu_end, Time = 0.000825 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.786674 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.787582 +INFO: TimeDuration, Event = Add_end, Time = 0.000908 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.787598 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.788453 +INFO: TimeDuration, Event = Relu_end, Time = 0.000855 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352797.788467 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352797.791400 +INFO: TimeDuration, Event = Pool_end, Time = 0.002933 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.813241 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.813712 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.813730 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.814167 +INFO: TimeDuration, Event = Relu_end, Time = 0.000438 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.830893 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.831368 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.831382 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.831819 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352797.831832 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352797.838519 +INFO: TimeDuration, Event = Pool_end, Time = 0.006687 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.848699 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.849016 +INFO: TimeDuration, Event = Add_end, Time = 0.000317 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.849031 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.849257 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.857931 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.858236 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.858250 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.858477 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.870547 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.870848 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.870863 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.871090 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352797.871103 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352797.873842 +INFO: TimeDuration, Event = Pool_end, Time = 0.002739 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.880434 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.880769 +INFO: TimeDuration, Event = Add_end, Time = 0.000335 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.880782 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.880905 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.887123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.887456 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.887470 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.887592 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.895552 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.895882 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.895895 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.896095 +INFO: TimeDuration, Event = Relu_end, Time = 0.000200 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352797.896113 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352797.900742 +INFO: TimeDuration, Event = Pool_end, Time = 0.004628 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.902630 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.902924 +INFO: TimeDuration, Event = Add_end, Time = 0.000294 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.902937 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.902980 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.905271 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.905557 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.905570 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.905612 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.907908 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.908193 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.908208 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.908250 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352797.908264 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352797.908881 +INFO: TimeDuration, Event = Pool_end, Time = 0.000618 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352797.908900 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352797.908995 +INFO: TimeDuration, Event = Mul_end, Time = 0.000095 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.909009 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.909036 +INFO: TimeDuration, Event = Add_end, Time = 0.000027 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.909048 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.909068 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352797.909082 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352797.909130 +INFO: TimeDuration, Event = Mul_end, Time = 0.000048 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.909143 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.909165 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352797.909179 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352797.909269 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000089 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 141.596407, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.949401 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.950314 +INFO: TimeDuration, Event = Add_end, Time = 0.000913 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.950331 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.951152 +INFO: TimeDuration, Event = Relu_end, Time = 0.000822 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.965640 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.966545 +INFO: TimeDuration, Event = Add_end, Time = 0.000906 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.966560 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.967415 +INFO: TimeDuration, Event = Relu_end, Time = 0.000854 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352797.967428 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352797.970365 +INFO: TimeDuration, Event = Pool_end, Time = 0.002937 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352797.992207 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352797.992680 +INFO: TimeDuration, Event = Add_end, Time = 0.000472 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352797.992698 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352797.993133 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.009838 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.010314 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.010327 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.010762 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352798.010774 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352798.017463 +INFO: TimeDuration, Event = Pool_end, Time = 0.006690 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.027624 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.027921 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.027946 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.028174 +INFO: TimeDuration, Event = Relu_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.036849 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.037153 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.037178 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.037410 +INFO: TimeDuration, Event = Relu_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.049467 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.049768 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.049783 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.050009 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352798.050023 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352798.052761 +INFO: TimeDuration, Event = Pool_end, Time = 0.002738 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.059353 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.059690 +INFO: TimeDuration, Event = Add_end, Time = 0.000337 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.059704 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.059826 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.065736 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.066069 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.066082 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.066204 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.074168 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.074497 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.074510 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.074653 +INFO: TimeDuration, Event = Relu_end, Time = 0.000143 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352798.074675 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352798.077407 +INFO: TimeDuration, Event = Pool_end, Time = 0.002732 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.079595 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.079880 +INFO: TimeDuration, Event = Add_end, Time = 0.000286 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.079893 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.079935 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.082232 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.082520 +INFO: TimeDuration, Event = Add_end, Time = 0.000288 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.082532 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.082574 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.084870 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.085158 +INFO: TimeDuration, Event = Add_end, Time = 0.000288 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.085171 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.085213 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352798.085227 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352798.085844 +INFO: TimeDuration, Event = Pool_end, Time = 0.000617 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352798.085863 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352798.085953 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.085967 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.085989 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.086001 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.086020 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352798.086035 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352798.086082 +INFO: TimeDuration, Event = Mul_end, Time = 0.000047 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.086097 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.086116 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352798.086131 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352798.086218 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000087 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.582527, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.126338 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.127240 +INFO: TimeDuration, Event = Add_end, Time = 0.000902 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.127257 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.128084 +INFO: TimeDuration, Event = Relu_end, Time = 0.000827 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.142564 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.143467 +INFO: TimeDuration, Event = Add_end, Time = 0.000903 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.143483 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.144334 +INFO: TimeDuration, Event = Relu_end, Time = 0.000851 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352798.144346 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352798.148351 +INFO: TimeDuration, Event = Pool_end, Time = 0.004006 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.170181 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.170651 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.170667 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.171106 +INFO: TimeDuration, Event = Relu_end, Time = 0.000439 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.186823 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.187297 +INFO: TimeDuration, Event = Add_end, Time = 0.000474 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.187311 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.187752 +INFO: TimeDuration, Event = Relu_end, Time = 0.000441 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352798.187766 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352798.194450 +INFO: TimeDuration, Event = Pool_end, Time = 0.006684 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.205561 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.205859 +INFO: TimeDuration, Event = Add_end, Time = 0.000299 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.205872 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.206099 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.214785 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.215089 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.215102 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.215328 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.227394 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.227695 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.227710 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.227935 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352798.227947 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352798.230689 +INFO: TimeDuration, Event = Pool_end, Time = 0.002742 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.237274 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.237611 +INFO: TimeDuration, Event = Add_end, Time = 0.000336 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.237623 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.237744 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.243662 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.243994 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.244007 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.244128 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.251778 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.252106 +INFO: TimeDuration, Event = Add_end, Time = 0.000328 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.252119 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.252241 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352798.252259 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352798.255020 +INFO: TimeDuration, Event = Pool_end, Time = 0.002760 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.257213 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.257499 +INFO: TimeDuration, Event = Add_end, Time = 0.000286 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.257512 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.257555 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.259842 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.260130 +INFO: TimeDuration, Event = Add_end, Time = 0.000288 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.260143 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.260185 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.262494 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.262779 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.262792 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.262836 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352798.262849 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352798.263470 +INFO: TimeDuration, Event = Pool_end, Time = 0.000622 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352798.263491 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352798.263582 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.263596 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.263617 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.263631 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.263650 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352798.263665 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352798.263713 +INFO: TimeDuration, Event = Mul_end, Time = 0.000047 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.263727 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.263746 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352798.263789 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352798.263876 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000088 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 140.307533, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.304025 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.304927 +INFO: TimeDuration, Event = Add_end, Time = 0.000902 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.304945 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.305775 +INFO: TimeDuration, Event = Relu_end, Time = 0.000829 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.320250 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.321156 +INFO: TimeDuration, Event = Add_end, Time = 0.000906 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.321175 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.322024 +INFO: TimeDuration, Event = Relu_end, Time = 0.000849 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352798.322039 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352798.324976 +INFO: TimeDuration, Event = Pool_end, Time = 0.002937 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.347868 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.348341 +INFO: TimeDuration, Event = Add_end, Time = 0.000473 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.348480 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.348914 +INFO: TimeDuration, Event = Relu_end, Time = 0.000433 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.364507 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.364983 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.364997 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.365434 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352798.365447 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352798.372137 +INFO: TimeDuration, Event = Pool_end, Time = 0.006690 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.382304 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.382603 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.382616 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.382843 +INFO: TimeDuration, Event = Relu_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.391542 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.391846 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.391870 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.392097 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.404150 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.404464 +INFO: TimeDuration, Event = Add_end, Time = 0.000314 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.404479 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.404703 +INFO: TimeDuration, Event = Relu_end, Time = 0.000224 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352798.404715 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352798.408523 +INFO: TimeDuration, Event = Pool_end, Time = 0.003808 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.414125 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.414463 +INFO: TimeDuration, Event = Add_end, Time = 0.000338 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.414476 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.414598 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.420518 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.420851 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.420876 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.421002 +INFO: TimeDuration, Event = Relu_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.428684 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.429014 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.429038 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.429162 +INFO: TimeDuration, Event = Relu_end, Time = 0.000124 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352798.429180 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352798.431925 +INFO: TimeDuration, Event = Pool_end, Time = 0.002745 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.434160 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.434447 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.434461 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.434503 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.438287 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.438575 +INFO: TimeDuration, Event = Add_end, Time = 0.000288 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.438590 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.438634 +INFO: TimeDuration, Event = Relu_end, Time = 0.000044 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.440915 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.441202 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.441216 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.441258 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352798.441283 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352798.441900 +INFO: TimeDuration, Event = Pool_end, Time = 0.000617 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352798.441918 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352798.442014 +INFO: TimeDuration, Event = Mul_end, Time = 0.000096 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.442027 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.442050 +INFO: TimeDuration, Event = Add_end, Time = 0.000023 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.442062 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.442081 +INFO: TimeDuration, Event = Relu_end, Time = 0.000019 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352798.442096 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352798.442153 +INFO: TimeDuration, Event = Mul_end, Time = 0.000057 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.442166 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.442187 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352798.442200 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352798.442343 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000144 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 66.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 141.035064, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.484982 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.485875 +INFO: TimeDuration, Event = Add_end, Time = 0.000893 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.485891 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.486716 +INFO: TimeDuration, Event = Relu_end, Time = 0.000825 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.501209 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.502106 +INFO: TimeDuration, Event = Add_end, Time = 0.000897 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.502122 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.502975 +INFO: TimeDuration, Event = Relu_end, Time = 0.000853 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352798.502989 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352798.505934 +INFO: TimeDuration, Event = Pool_end, Time = 0.002945 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.527774 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.528244 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.528261 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.528695 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.546678 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.547154 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.547168 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.547603 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352798.547616 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352798.554304 +INFO: TimeDuration, Event = Pool_end, Time = 0.006688 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.564477 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.564775 +INFO: TimeDuration, Event = Add_end, Time = 0.000299 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.564789 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.565015 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.573899 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.574203 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.574216 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.574441 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.586511 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.586812 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.586825 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.587052 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352798.587064 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352798.589805 +INFO: TimeDuration, Event = Pool_end, Time = 0.002741 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.596395 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.596729 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.596741 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.596864 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.603082 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.603416 +INFO: TimeDuration, Event = Add_end, Time = 0.000334 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.603430 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.603552 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.611202 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.611536 +INFO: TimeDuration, Event = Add_end, Time = 0.000334 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.611551 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.611672 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352798.611691 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352798.614445 +INFO: TimeDuration, Event = Pool_end, Time = 0.002754 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.616659 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.616939 +INFO: TimeDuration, Event = Add_end, Time = 0.000280 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.616953 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.616995 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.619286 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.619577 +INFO: TimeDuration, Event = Add_end, Time = 0.000291 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.619590 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.619633 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.621947 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.622233 +INFO: TimeDuration, Event = Add_end, Time = 0.000286 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.622245 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.622286 +INFO: TimeDuration, Event = Relu_end, Time = 0.000041 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352798.622300 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352798.622922 +INFO: TimeDuration, Event = Pool_end, Time = 0.000623 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352798.622940 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352798.623031 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.623046 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.623068 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.623079 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.623098 +INFO: TimeDuration, Event = Relu_end, Time = 0.000019 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352798.623113 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352798.623161 +INFO: TimeDuration, Event = Mul_end, Time = 0.000047 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.623174 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.623193 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352798.623207 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352798.623293 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000086 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 141.065868, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.663387 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.664295 +INFO: TimeDuration, Event = Add_end, Time = 0.000908 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.664477 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.665286 +INFO: TimeDuration, Event = Relu_end, Time = 0.000809 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.679772 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.680671 +INFO: TimeDuration, Event = Add_end, Time = 0.000900 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.680689 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.681545 +INFO: TimeDuration, Event = Relu_end, Time = 0.000856 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352798.681559 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352798.684496 +INFO: TimeDuration, Event = Pool_end, Time = 0.002937 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.706337 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.706807 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.706822 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.707257 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.723983 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.724460 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.724474 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.724911 +INFO: TimeDuration, Event = Relu_end, Time = 0.000438 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352798.724925 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352798.731612 +INFO: TimeDuration, Event = Pool_end, Time = 0.006687 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.741784 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.742081 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.742096 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.742322 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.751009 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.751313 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.751328 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.751553 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.763643 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.763944 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.763959 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.764186 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352798.764198 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352798.766923 +INFO: TimeDuration, Event = Pool_end, Time = 0.002725 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.773507 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.773842 +INFO: TimeDuration, Event = Add_end, Time = 0.000335 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.773855 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.773977 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.780197 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.780531 +INFO: TimeDuration, Event = Add_end, Time = 0.000334 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.780545 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.780668 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.788639 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.788972 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.788985 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.789107 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352798.789124 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352798.791875 +INFO: TimeDuration, Event = Pool_end, Time = 0.002751 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.794090 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.794373 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.794386 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.794428 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.796698 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.796985 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.796999 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.797041 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.799327 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.799613 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.799625 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.799667 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352798.799680 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352798.800307 +INFO: TimeDuration, Event = Pool_end, Time = 0.000627 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352798.800323 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352798.800415 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.800430 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.800451 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.800464 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.800484 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352798.800497 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352798.800545 +INFO: TimeDuration, Event = Mul_end, Time = 0.000048 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.800559 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.800578 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352798.800604 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352798.800693 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000088 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.908975, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.840694 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.841597 +INFO: TimeDuration, Event = Add_end, Time = 0.000902 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.841611 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.842438 +INFO: TimeDuration, Event = Relu_end, Time = 0.000827 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.856946 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.857847 +INFO: TimeDuration, Event = Add_end, Time = 0.000900 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.857864 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.858717 +INFO: TimeDuration, Event = Relu_end, Time = 0.000853 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352798.858729 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352798.861673 +INFO: TimeDuration, Event = Pool_end, Time = 0.002944 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.883513 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.883986 +INFO: TimeDuration, Event = Add_end, Time = 0.000473 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.884002 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.884439 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.901155 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.901630 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.901644 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.902077 +INFO: TimeDuration, Event = Relu_end, Time = 0.000433 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352798.902091 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352798.908782 +INFO: TimeDuration, Event = Pool_end, Time = 0.006692 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.918946 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.919242 +INFO: TimeDuration, Event = Add_end, Time = 0.000296 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.919256 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.919481 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.928181 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.928487 +INFO: TimeDuration, Event = Add_end, Time = 0.000306 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.928500 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.928727 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.940788 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.941089 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.941102 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.941331 +INFO: TimeDuration, Event = Relu_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352798.941345 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352798.944080 +INFO: TimeDuration, Event = Pool_end, Time = 0.002735 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.950669 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.951002 +INFO: TimeDuration, Event = Add_end, Time = 0.000334 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.951028 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.951148 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.957367 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.957699 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.957712 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.957834 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.965632 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.965962 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.965976 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.966099 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352798.966117 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352798.968870 +INFO: TimeDuration, Event = Pool_end, Time = 0.002753 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.971087 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.971373 +INFO: TimeDuration, Event = Add_end, Time = 0.000286 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.971386 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.971428 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.973712 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.973998 +INFO: TimeDuration, Event = Add_end, Time = 0.000286 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.974013 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.974055 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.976344 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.976629 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.976641 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.976683 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352798.976695 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352798.977318 +INFO: TimeDuration, Event = Pool_end, Time = 0.000622 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352798.977335 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352798.977425 +INFO: TimeDuration, Event = Mul_end, Time = 0.000090 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.977441 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.977462 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352798.977474 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352798.977494 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352798.977508 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352798.977580 +INFO: TimeDuration, Event = Mul_end, Time = 0.000072 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352798.977594 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352798.977614 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352798.977629 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352798.977714 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000085 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.798430, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.020485 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.021387 +INFO: TimeDuration, Event = Add_end, Time = 0.000902 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.021403 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.022225 +INFO: TimeDuration, Event = Relu_end, Time = 0.000821 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.036783 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.037680 +INFO: TimeDuration, Event = Add_end, Time = 0.000897 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.037696 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.038553 +INFO: TimeDuration, Event = Relu_end, Time = 0.000857 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352799.038568 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352799.041507 +INFO: TimeDuration, Event = Pool_end, Time = 0.002939 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.063365 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.063838 +INFO: TimeDuration, Event = Add_end, Time = 0.000473 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.063855 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.064291 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.080997 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.081472 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.081486 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.081921 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352799.081933 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352799.088623 +INFO: TimeDuration, Event = Pool_end, Time = 0.006689 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.098795 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.099093 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.099106 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.099333 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.108030 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.108336 +INFO: TimeDuration, Event = Add_end, Time = 0.000306 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.108434 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.108659 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.120641 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.120942 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.120955 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.121181 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352799.121193 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352799.123934 +INFO: TimeDuration, Event = Pool_end, Time = 0.002740 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.130527 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.130864 +INFO: TimeDuration, Event = Add_end, Time = 0.000337 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.130876 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.131001 +INFO: TimeDuration, Event = Relu_end, Time = 0.000125 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.137217 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.137549 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.137562 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.137684 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.145648 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.145978 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.145991 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.146112 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352799.146130 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352799.148889 +INFO: TimeDuration, Event = Pool_end, Time = 0.002758 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.151101 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.151386 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.151399 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.151441 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.153712 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.153999 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.154011 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.154053 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.156321 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.156605 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.156617 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.156660 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352799.156672 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352799.157290 +INFO: TimeDuration, Event = Pool_end, Time = 0.000618 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352799.157309 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352799.157403 +INFO: TimeDuration, Event = Mul_end, Time = 0.000094 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.157417 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.157439 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.157452 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.157471 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352799.157487 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352799.157534 +INFO: TimeDuration, Event = Mul_end, Time = 0.000048 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.157549 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.157570 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352799.157584 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352799.157672 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000088 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 66.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.963209, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.197481 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.198386 +INFO: TimeDuration, Event = Add_end, Time = 0.000904 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.198401 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.199229 +INFO: TimeDuration, Event = Relu_end, Time = 0.000827 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.213706 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.214604 +INFO: TimeDuration, Event = Add_end, Time = 0.000899 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.214620 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.215471 +INFO: TimeDuration, Event = Relu_end, Time = 0.000851 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352799.215496 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352799.218433 +INFO: TimeDuration, Event = Pool_end, Time = 0.002938 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.240275 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.240746 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.240767 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.241203 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.257945 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.258420 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.258434 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.258873 +INFO: TimeDuration, Event = Relu_end, Time = 0.000439 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352799.258887 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352799.265571 +INFO: TimeDuration, Event = Pool_end, Time = 0.006684 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.275732 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.276029 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.276044 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.276270 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.284953 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.285257 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.285270 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.285499 +INFO: TimeDuration, Event = Relu_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.297555 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.297855 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.297870 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.298097 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352799.298123 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352799.300849 +INFO: TimeDuration, Event = Pool_end, Time = 0.002727 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.307446 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.307780 +INFO: TimeDuration, Event = Add_end, Time = 0.000334 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.307793 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.307915 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.314140 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.314478 +INFO: TimeDuration, Event = Add_end, Time = 0.000337 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.314492 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.314614 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.322258 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.322589 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.322602 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.322724 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352799.322742 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352799.325499 +INFO: TimeDuration, Event = Pool_end, Time = 0.002756 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.327708 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.327992 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.328006 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.328048 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.330322 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.330615 +INFO: TimeDuration, Event = Add_end, Time = 0.000293 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.330629 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.330671 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.332951 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.333237 +INFO: TimeDuration, Event = Add_end, Time = 0.000286 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.333250 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.333292 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352799.333306 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352799.333927 +INFO: TimeDuration, Event = Pool_end, Time = 0.000621 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352799.333946 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352799.334037 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.334051 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.334073 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.334086 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.334106 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352799.334120 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352799.334167 +INFO: TimeDuration, Event = Mul_end, Time = 0.000048 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.334181 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.334200 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352799.334215 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352799.334302 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000087 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.562944, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.373973 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.374878 +INFO: TimeDuration, Event = Add_end, Time = 0.000905 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.374892 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.375719 +INFO: TimeDuration, Event = Relu_end, Time = 0.000828 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.390209 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.391114 +INFO: TimeDuration, Event = Add_end, Time = 0.000905 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.391130 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.391983 +INFO: TimeDuration, Event = Relu_end, Time = 0.000852 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352799.391996 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352799.394935 +INFO: TimeDuration, Event = Pool_end, Time = 0.002939 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.416773 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.417244 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.417260 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.417696 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.434405 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.434881 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.434895 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.435329 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352799.435341 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352799.442033 +INFO: TimeDuration, Event = Pool_end, Time = 0.006692 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.452196 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.452493 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.452506 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.452732 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.461424 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.461729 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.461744 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.461969 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.474040 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.474342 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.474356 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.474584 +INFO: TimeDuration, Event = Relu_end, Time = 0.000228 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352799.474597 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352799.477335 +INFO: TimeDuration, Event = Pool_end, Time = 0.002737 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.483917 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.484250 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.484262 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.484385 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.490315 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.490649 +INFO: TimeDuration, Event = Add_end, Time = 0.000334 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.490663 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.490787 +INFO: TimeDuration, Event = Relu_end, Time = 0.000124 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.498747 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.499077 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.499089 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.499212 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352799.499229 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352799.501988 +INFO: TimeDuration, Event = Pool_end, Time = 0.002758 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.504202 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.504486 +INFO: TimeDuration, Event = Add_end, Time = 0.000283 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.504502 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.504544 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.506857 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.507143 +INFO: TimeDuration, Event = Add_end, Time = 0.000286 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.507157 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.507199 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.509492 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.509776 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.509790 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.509833 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352799.509845 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352799.510467 +INFO: TimeDuration, Event = Pool_end, Time = 0.000621 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352799.510486 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352799.510576 +INFO: TimeDuration, Event = Mul_end, Time = 0.000090 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.510590 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.510611 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.510623 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.510643 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352799.510657 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352799.510704 +INFO: TimeDuration, Event = Mul_end, Time = 0.000048 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.510717 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.510736 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352799.510749 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352799.510837 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000088 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 68.199997 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.598891, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.550858 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.551761 +INFO: TimeDuration, Event = Add_end, Time = 0.000904 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.551776 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.552598 +INFO: TimeDuration, Event = Relu_end, Time = 0.000822 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.567084 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.567989 +INFO: TimeDuration, Event = Add_end, Time = 0.000905 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.568005 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.568858 +INFO: TimeDuration, Event = Relu_end, Time = 0.000852 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352799.568872 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352799.571809 +INFO: TimeDuration, Event = Pool_end, Time = 0.002937 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.593650 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.594121 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.594138 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.594573 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.611303 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.611778 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.611792 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.612229 +INFO: TimeDuration, Event = Relu_end, Time = 0.000437 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352799.612242 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352799.618930 +INFO: TimeDuration, Event = Pool_end, Time = 0.006688 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.629090 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.629386 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.629400 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.629625 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.638327 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.638631 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.638645 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.638876 +INFO: TimeDuration, Event = Relu_end, Time = 0.000231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.650948 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.651250 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.651263 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.651488 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352799.651500 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352799.654242 +INFO: TimeDuration, Event = Pool_end, Time = 0.002741 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.660834 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.661172 +INFO: TimeDuration, Event = Add_end, Time = 0.000338 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.661185 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.661307 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.667519 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.667847 +INFO: TimeDuration, Event = Add_end, Time = 0.000328 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.667861 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.667982 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.675965 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.676295 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.676314 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.676436 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352799.676453 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352799.679206 +INFO: TimeDuration, Event = Pool_end, Time = 0.002752 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.681415 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.681702 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.681716 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.681759 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.684031 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.684320 +INFO: TimeDuration, Event = Add_end, Time = 0.000289 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.684434 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.684477 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.686662 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.686947 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.686960 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.687002 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352799.687014 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352799.687636 +INFO: TimeDuration, Event = Pool_end, Time = 0.000622 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352799.687655 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352799.687746 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.687762 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.687783 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.687796 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.687840 +INFO: TimeDuration, Event = Relu_end, Time = 0.000044 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352799.687856 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352799.687905 +INFO: TimeDuration, Event = Mul_end, Time = 0.000049 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.687918 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.687938 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352799.687952 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352799.688040 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000088 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.886460, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.727917 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.728819 +INFO: TimeDuration, Event = Add_end, Time = 0.000903 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.728837 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.729663 +INFO: TimeDuration, Event = Relu_end, Time = 0.000827 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.744136 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.745039 +INFO: TimeDuration, Event = Add_end, Time = 0.000903 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.745056 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.745906 +INFO: TimeDuration, Event = Relu_end, Time = 0.000851 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352799.745921 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352799.748864 +INFO: TimeDuration, Event = Pool_end, Time = 0.002943 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.770706 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.771175 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.771190 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.771621 +INFO: TimeDuration, Event = Relu_end, Time = 0.000430 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.789926 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.790402 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.790417 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.790856 +INFO: TimeDuration, Event = Relu_end, Time = 0.000439 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352799.790868 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352799.797551 +INFO: TimeDuration, Event = Pool_end, Time = 0.006683 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.809431 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.809729 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.809743 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.809970 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.818653 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.818957 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.818969 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.819195 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.831259 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.831560 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.831573 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.831799 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352799.831812 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352799.834552 +INFO: TimeDuration, Event = Pool_end, Time = 0.002740 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.841143 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.841477 +INFO: TimeDuration, Event = Add_end, Time = 0.000334 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.841489 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.841610 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.847529 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.847861 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.847874 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.847995 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.855654 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.855984 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.855996 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.856119 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352799.856138 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352799.858896 +INFO: TimeDuration, Event = Pool_end, Time = 0.002758 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.861100 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.861384 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.861398 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.861440 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.863716 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.864004 +INFO: TimeDuration, Event = Add_end, Time = 0.000288 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.864017 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.864059 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.866339 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.866622 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.866636 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.866678 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352799.866691 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352799.867314 +INFO: TimeDuration, Event = Pool_end, Time = 0.000623 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352799.867347 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352799.867439 +INFO: TimeDuration, Event = Mul_end, Time = 0.000092 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.867453 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.867477 +INFO: TimeDuration, Event = Add_end, Time = 0.000025 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.867491 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.867510 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352799.867525 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352799.867573 +INFO: TimeDuration, Event = Mul_end, Time = 0.000048 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.867586 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.867606 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352799.867620 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352799.867706 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000086 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 142.548565, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.908038 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.908947 +INFO: TimeDuration, Event = Add_end, Time = 0.000909 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.908963 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.909794 +INFO: TimeDuration, Event = Relu_end, Time = 0.000831 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.924282 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.925183 +INFO: TimeDuration, Event = Add_end, Time = 0.000901 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.925199 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.926047 +INFO: TimeDuration, Event = Relu_end, Time = 0.000848 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352799.926060 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352799.929008 +INFO: TimeDuration, Event = Pool_end, Time = 0.002948 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.951912 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.952388 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.952404 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.952837 +INFO: TimeDuration, Event = Relu_end, Time = 0.000433 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.968563 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.969037 +INFO: TimeDuration, Event = Add_end, Time = 0.000473 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.969050 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.969485 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352799.969511 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352799.976177 +INFO: TimeDuration, Event = Pool_end, Time = 0.006667 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.986344 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.986642 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.986656 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.986882 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352799.995567 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352799.995871 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352799.995884 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352799.996112 +INFO: TimeDuration, Event = Relu_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.008296 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.008637 +INFO: TimeDuration, Event = Add_end, Time = 0.000342 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.008652 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.008877 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352800.008888 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352800.012644 +INFO: TimeDuration, Event = Pool_end, Time = 0.003755 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.018270 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.018604 +INFO: TimeDuration, Event = Add_end, Time = 0.000335 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.018617 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.018739 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.024704 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.025037 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.025050 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.025172 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.032829 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.033160 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.033172 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.033295 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352800.033313 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352800.036067 +INFO: TimeDuration, Event = Pool_end, Time = 0.002754 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.038295 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.038579 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.038592 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.038634 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.040916 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.041203 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.041216 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.041258 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.043548 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.043832 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.043844 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.043886 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352800.043899 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352800.044522 +INFO: TimeDuration, Event = Pool_end, Time = 0.000623 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352800.044541 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352800.044635 +INFO: TimeDuration, Event = Mul_end, Time = 0.000094 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.044648 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.044672 +INFO: TimeDuration, Event = Add_end, Time = 0.000024 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.044684 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.044703 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352800.044717 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352800.044765 +INFO: TimeDuration, Event = Mul_end, Time = 0.000048 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.044777 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.044797 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352800.044810 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352800.044899 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000089 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.664704, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.084788 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.085686 +INFO: TimeDuration, Event = Add_end, Time = 0.000898 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.085703 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.086529 +INFO: TimeDuration, Event = Relu_end, Time = 0.000825 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.101011 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.101913 +INFO: TimeDuration, Event = Add_end, Time = 0.000902 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.101928 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.102778 +INFO: TimeDuration, Event = Relu_end, Time = 0.000850 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352800.102792 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352800.105733 +INFO: TimeDuration, Event = Pool_end, Time = 0.002941 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.127589 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.128059 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.128075 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.128513 +INFO: TimeDuration, Event = Relu_end, Time = 0.000437 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.145219 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.145694 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.145708 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.146143 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352800.146155 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352800.152846 +INFO: TimeDuration, Event = Pool_end, Time = 0.006691 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.163006 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.163303 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.163317 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.163543 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.172239 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.172547 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.172561 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.172786 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.184851 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.185152 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.185166 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.185393 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352800.185406 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352800.188146 +INFO: TimeDuration, Event = Pool_end, Time = 0.002739 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.194893 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.195226 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.195238 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.195359 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.201611 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.201941 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.201955 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.202078 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.209743 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.210074 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.210088 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.210210 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352800.210227 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352800.212982 +INFO: TimeDuration, Event = Pool_end, Time = 0.002756 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.215184 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.215472 +INFO: TimeDuration, Event = Add_end, Time = 0.000288 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.215485 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.215527 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.217808 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.218093 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.218105 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.218147 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.220464 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.220746 +INFO: TimeDuration, Event = Add_end, Time = 0.000283 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.220760 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.220803 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352800.220828 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352800.221439 +INFO: TimeDuration, Event = Pool_end, Time = 0.000611 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352800.221457 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352800.221548 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.221563 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.221585 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.221598 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.221618 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352800.221632 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352800.221680 +INFO: TimeDuration, Event = Mul_end, Time = 0.000048 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.221693 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.221712 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352800.221726 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352800.221814 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000087 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 66.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.766878, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.261877 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.262780 +INFO: TimeDuration, Event = Add_end, Time = 0.000903 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.262797 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.263621 +INFO: TimeDuration, Event = Relu_end, Time = 0.000825 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.278105 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.279007 +INFO: TimeDuration, Event = Add_end, Time = 0.000902 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.279023 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.279876 +INFO: TimeDuration, Event = Relu_end, Time = 0.000853 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352800.279889 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352800.282833 +INFO: TimeDuration, Event = Pool_end, Time = 0.002944 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.304698 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.305172 +INFO: TimeDuration, Event = Add_end, Time = 0.000473 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.305188 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.305621 +INFO: TimeDuration, Event = Relu_end, Time = 0.000433 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.322343 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.322824 +INFO: TimeDuration, Event = Add_end, Time = 0.000480 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.322837 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.323268 +INFO: TimeDuration, Event = Relu_end, Time = 0.000432 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352800.323280 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352800.329970 +INFO: TimeDuration, Event = Pool_end, Time = 0.006690 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.340464 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.340780 +INFO: TimeDuration, Event = Add_end, Time = 0.000316 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.340795 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.341036 +INFO: TimeDuration, Event = Relu_end, Time = 0.000240 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.349784 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.350091 +INFO: TimeDuration, Event = Add_end, Time = 0.000307 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.350106 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.350332 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.363413 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.363788 +INFO: TimeDuration, Event = Add_end, Time = 0.000375 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.363830 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.364077 +INFO: TimeDuration, Event = Relu_end, Time = 0.000247 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352800.364106 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352800.367710 +INFO: TimeDuration, Event = Pool_end, Time = 0.003604 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.373533 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.373877 +INFO: TimeDuration, Event = Add_end, Time = 0.000343 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.373893 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.374021 +INFO: TimeDuration, Event = Relu_end, Time = 0.000128 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.380443 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.380783 +INFO: TimeDuration, Event = Add_end, Time = 0.000340 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.380800 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.380937 +INFO: TimeDuration, Event = Relu_end, Time = 0.000137 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.389205 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.389559 +INFO: TimeDuration, Event = Add_end, Time = 0.000354 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.389578 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.389707 +INFO: TimeDuration, Event = Relu_end, Time = 0.000129 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352800.389733 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352800.392458 +INFO: TimeDuration, Event = Pool_end, Time = 0.002725 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.394791 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.395086 +INFO: TimeDuration, Event = Add_end, Time = 0.000294 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.395102 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.395148 +INFO: TimeDuration, Event = Relu_end, Time = 0.000046 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.397779 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.398070 +INFO: TimeDuration, Event = Add_end, Time = 0.000291 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.398086 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.398132 +INFO: TimeDuration, Event = Relu_end, Time = 0.000046 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.400524 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.400814 +INFO: TimeDuration, Event = Add_end, Time = 0.000290 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.400830 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.400875 +INFO: TimeDuration, Event = Relu_end, Time = 0.000045 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352800.400891 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352800.401505 +INFO: TimeDuration, Event = Pool_end, Time = 0.000614 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352800.401529 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352800.401675 +INFO: TimeDuration, Event = Mul_end, Time = 0.000146 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.401693 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.401720 +INFO: TimeDuration, Event = Add_end, Time = 0.000026 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.401734 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.401757 +INFO: TimeDuration, Event = Relu_end, Time = 0.000022 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352800.401774 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352800.401837 +INFO: TimeDuration, Event = Mul_end, Time = 0.000063 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.401853 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.401875 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352800.401892 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352800.401987 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000095 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 142.794275, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.448229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.449219 +INFO: TimeDuration, Event = Add_end, Time = 0.000990 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.449258 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.450096 +INFO: TimeDuration, Event = Relu_end, Time = 0.000837 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.467239 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.468142 +INFO: TimeDuration, Event = Add_end, Time = 0.000903 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.468175 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.469031 +INFO: TimeDuration, Event = Relu_end, Time = 0.000856 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352800.469048 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352800.471971 +INFO: TimeDuration, Event = Pool_end, Time = 0.002923 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.493806 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.494277 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.494296 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.494728 +INFO: TimeDuration, Event = Relu_end, Time = 0.000433 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.511509 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.511989 +INFO: TimeDuration, Event = Add_end, Time = 0.000480 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.512002 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.512439 +INFO: TimeDuration, Event = Relu_end, Time = 0.000437 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352800.512453 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352800.519139 +INFO: TimeDuration, Event = Pool_end, Time = 0.006686 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.529303 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.529601 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.529618 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.529843 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.538707 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.539012 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.539026 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.539252 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.551316 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.551619 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.551632 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.551859 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352800.551871 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352800.554608 +INFO: TimeDuration, Event = Pool_end, Time = 0.002738 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.561197 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.561530 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.561543 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.561665 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.567584 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.567916 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.567929 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.568051 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.576717 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.577049 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.577063 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.577184 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352800.577204 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352800.579955 +INFO: TimeDuration, Event = Pool_end, Time = 0.002752 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.582181 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.582469 +INFO: TimeDuration, Event = Add_end, Time = 0.000288 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.582482 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.582524 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.584796 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.585085 +INFO: TimeDuration, Event = Add_end, Time = 0.000289 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.585098 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.585140 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.587424 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.587704 +INFO: TimeDuration, Event = Add_end, Time = 0.000281 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.587717 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.587759 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352800.587773 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352800.588397 +INFO: TimeDuration, Event = Pool_end, Time = 0.000624 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352800.588439 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352800.588543 +INFO: TimeDuration, Event = Mul_end, Time = 0.000105 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.588558 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.588582 +INFO: TimeDuration, Event = Add_end, Time = 0.000024 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.588595 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.588615 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352800.588630 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352800.588684 +INFO: TimeDuration, Event = Mul_end, Time = 0.000053 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.588698 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.588717 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352800.588731 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352800.588817 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000085 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 146.451337, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.644944 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.645847 +INFO: TimeDuration, Event = Add_end, Time = 0.000903 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.645863 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.646689 +INFO: TimeDuration, Event = Relu_end, Time = 0.000825 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.661204 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.662105 +INFO: TimeDuration, Event = Add_end, Time = 0.000901 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.662121 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.662971 +INFO: TimeDuration, Event = Relu_end, Time = 0.000850 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352800.662986 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352800.665930 +INFO: TimeDuration, Event = Pool_end, Time = 0.002944 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.687819 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.688289 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.688339 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.688771 +INFO: TimeDuration, Event = Relu_end, Time = 0.000432 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.705435 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.705909 +INFO: TimeDuration, Event = Add_end, Time = 0.000474 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.705922 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.706360 +INFO: TimeDuration, Event = Relu_end, Time = 0.000438 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352800.706372 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352800.713061 +INFO: TimeDuration, Event = Pool_end, Time = 0.006689 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.723237 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.723534 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.723547 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.723773 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.732475 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.732780 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.732793 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.733019 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.747586 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.747942 +INFO: TimeDuration, Event = Add_end, Time = 0.000356 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.747961 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.748188 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352800.748203 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352800.751570 +INFO: TimeDuration, Event = Pool_end, Time = 0.003368 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.755131 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.755472 +INFO: TimeDuration, Event = Add_end, Time = 0.000341 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.755485 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.755608 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.761531 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.761864 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.761876 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.761999 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.769968 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.770299 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.770312 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.770433 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352800.770451 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352800.773210 +INFO: TimeDuration, Event = Pool_end, Time = 0.002759 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.775426 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.775711 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.775723 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.775765 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.778070 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.778355 +INFO: TimeDuration, Event = Add_end, Time = 0.000286 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.778369 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.778410 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.780700 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.780986 +INFO: TimeDuration, Event = Add_end, Time = 0.000286 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.781013 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.781056 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352800.781070 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352800.781676 +INFO: TimeDuration, Event = Pool_end, Time = 0.000606 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352800.781694 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352800.781785 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.781799 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.781828 +INFO: TimeDuration, Event = Add_end, Time = 0.000029 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.781840 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.781860 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352800.781876 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352800.781924 +INFO: TimeDuration, Event = Mul_end, Time = 0.000049 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.781937 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.781957 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352800.781972 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352800.782055 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000083 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.853451, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.822019 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.822930 +INFO: TimeDuration, Event = Add_end, Time = 0.000911 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.822947 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.823777 +INFO: TimeDuration, Event = Relu_end, Time = 0.000830 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.838305 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.839203 +INFO: TimeDuration, Event = Add_end, Time = 0.000898 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.839218 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.840070 +INFO: TimeDuration, Event = Relu_end, Time = 0.000851 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352800.840082 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352800.843027 +INFO: TimeDuration, Event = Pool_end, Time = 0.002944 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.864867 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.865339 +INFO: TimeDuration, Event = Add_end, Time = 0.000472 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.865355 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.865790 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.882535 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.883011 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.883023 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.883460 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352800.883472 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352800.890162 +INFO: TimeDuration, Event = Pool_end, Time = 0.006690 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.900440 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.900737 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.900750 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.900975 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.909603 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.909911 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.909924 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.910149 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.922239 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.922540 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.922553 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.922779 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352800.922791 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352800.925533 +INFO: TimeDuration, Event = Pool_end, Time = 0.002742 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.932115 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.932450 +INFO: TimeDuration, Event = Add_end, Time = 0.000336 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.932466 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.932589 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.938809 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.939138 +INFO: TimeDuration, Event = Add_end, Time = 0.000329 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.939153 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.939275 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.946936 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.947266 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.947280 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.947403 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352800.947420 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352800.950176 +INFO: TimeDuration, Event = Pool_end, Time = 0.002756 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.952439 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.952723 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.952736 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.952778 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.954985 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.955275 +INFO: TimeDuration, Event = Add_end, Time = 0.000289 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.955287 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.955329 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.957597 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.957881 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.957894 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.957936 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352800.957948 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352800.958573 +INFO: TimeDuration, Event = Pool_end, Time = 0.000625 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352800.958591 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352800.958682 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.958696 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.958717 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352800.958729 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352800.958749 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352800.958762 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352800.958810 +INFO: TimeDuration, Event = Mul_end, Time = 0.000048 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.958823 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352800.958842 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352800.958855 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352800.958935 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000080 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 66.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.502303, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352800.999448 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.000358 +INFO: TimeDuration, Event = Add_end, Time = 0.000910 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.000371 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.001202 +INFO: TimeDuration, Event = Relu_end, Time = 0.000830 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.015722 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.016624 +INFO: TimeDuration, Event = Add_end, Time = 0.000902 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.016641 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.017500 +INFO: TimeDuration, Event = Relu_end, Time = 0.000859 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352801.017513 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352801.020447 +INFO: TimeDuration, Event = Pool_end, Time = 0.002935 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.042286 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.042757 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.042773 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.043208 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.059934 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.060410 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.060422 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.060857 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352801.060871 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352801.067563 +INFO: TimeDuration, Event = Pool_end, Time = 0.006691 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.077731 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.078028 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.078042 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.078268 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.086968 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.087271 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.087286 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.087511 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.099580 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.099881 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.099894 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.100120 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352801.100131 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352801.102874 +INFO: TimeDuration, Event = Pool_end, Time = 0.002743 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.109467 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.109802 +INFO: TimeDuration, Event = Add_end, Time = 0.000335 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.109817 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.109938 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.115876 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.116207 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.116220 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.116343 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.124319 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.124651 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.124664 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.124786 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352801.124804 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352801.127552 +INFO: TimeDuration, Event = Pool_end, Time = 0.002748 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.129777 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.130066 +INFO: TimeDuration, Event = Add_end, Time = 0.000289 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.130079 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.130121 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.132448 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.132738 +INFO: TimeDuration, Event = Add_end, Time = 0.000290 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.132752 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.132794 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.135080 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.135361 +INFO: TimeDuration, Event = Add_end, Time = 0.000281 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.135375 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.135418 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352801.135430 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352801.136056 +INFO: TimeDuration, Event = Pool_end, Time = 0.000626 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352801.136098 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352801.136192 +INFO: TimeDuration, Event = Mul_end, Time = 0.000094 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.136206 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.136226 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.136238 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.136258 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352801.136274 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352801.136322 +INFO: TimeDuration, Event = Mul_end, Time = 0.000049 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.136434 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.136456 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352801.136470 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352801.136553 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000083 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.796932, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.176587 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.177490 +INFO: TimeDuration, Event = Add_end, Time = 0.000904 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.177507 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.178330 +INFO: TimeDuration, Event = Relu_end, Time = 0.000823 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.192811 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.193707 +INFO: TimeDuration, Event = Add_end, Time = 0.000896 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.193723 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.194572 +INFO: TimeDuration, Event = Relu_end, Time = 0.000849 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352801.194586 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352801.197536 +INFO: TimeDuration, Event = Pool_end, Time = 0.002949 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.219378 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.219851 +INFO: TimeDuration, Event = Add_end, Time = 0.000474 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.219868 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.220301 +INFO: TimeDuration, Event = Relu_end, Time = 0.000433 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.237052 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.237526 +INFO: TimeDuration, Event = Add_end, Time = 0.000474 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.237539 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.237974 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352801.237986 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352801.244678 +INFO: TimeDuration, Event = Pool_end, Time = 0.006693 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.254853 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.255151 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.255165 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.255390 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.264084 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.264389 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.264400 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.264627 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.276706 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.277007 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.277021 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.277247 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352801.277260 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352801.279998 +INFO: TimeDuration, Event = Pool_end, Time = 0.002738 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.286594 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.286933 +INFO: TimeDuration, Event = Add_end, Time = 0.000339 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.286946 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.287067 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.292976 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.293311 +INFO: TimeDuration, Event = Add_end, Time = 0.000335 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.293323 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.293446 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.301098 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.301429 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.301441 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.301563 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352801.301580 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352801.304337 +INFO: TimeDuration, Event = Pool_end, Time = 0.002756 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.306553 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.306840 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.306853 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.306896 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.309200 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.309488 +INFO: TimeDuration, Event = Add_end, Time = 0.000288 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.309503 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.309547 +INFO: TimeDuration, Event = Relu_end, Time = 0.000044 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.311855 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.312142 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.312155 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.312198 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352801.312210 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352801.312831 +INFO: TimeDuration, Event = Pool_end, Time = 0.000621 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352801.312850 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352801.312941 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.312956 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.312977 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.312989 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.313009 +INFO: TimeDuration, Event = Relu_end, Time = 0.000019 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352801.313024 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352801.313072 +INFO: TimeDuration, Event = Mul_end, Time = 0.000047 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.313084 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.313104 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352801.313117 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352801.313198 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000081 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 68.199997 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.357356, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.353618 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.354511 +INFO: TimeDuration, Event = Add_end, Time = 0.000893 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.354527 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.355352 +INFO: TimeDuration, Event = Relu_end, Time = 0.000825 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.369882 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.370778 +INFO: TimeDuration, Event = Add_end, Time = 0.000896 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.370793 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.371643 +INFO: TimeDuration, Event = Relu_end, Time = 0.000850 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352801.371657 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352801.374608 +INFO: TimeDuration, Event = Pool_end, Time = 0.002951 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.397500 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.397990 +INFO: TimeDuration, Event = Add_end, Time = 0.000490 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.398006 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.398440 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.415380 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.415855 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.415868 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.416302 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352801.416437 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352801.423048 +INFO: TimeDuration, Event = Pool_end, Time = 0.006610 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.433183 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.433483 +INFO: TimeDuration, Event = Add_end, Time = 0.000300 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.433503 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.433733 +INFO: TimeDuration, Event = Relu_end, Time = 0.000230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.442584 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.442890 +INFO: TimeDuration, Event = Add_end, Time = 0.000306 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.442904 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.443130 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.455201 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.455502 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.455515 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.455742 +INFO: TimeDuration, Event = Relu_end, Time = 0.000228 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352801.455756 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352801.458500 +INFO: TimeDuration, Event = Pool_end, Time = 0.002744 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.465077 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.465410 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.465423 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.465546 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.471460 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.471797 +INFO: TimeDuration, Event = Add_end, Time = 0.000337 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.471809 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.471930 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.480586 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.480918 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.480933 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.481055 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352801.481076 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352801.483825 +INFO: TimeDuration, Event = Pool_end, Time = 0.002750 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.486060 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.486347 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.486360 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.486402 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.488679 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.488969 +INFO: TimeDuration, Event = Add_end, Time = 0.000290 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.488983 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.489025 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.491333 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.491617 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.491631 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.491672 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352801.491686 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352801.492311 +INFO: TimeDuration, Event = Pool_end, Time = 0.000626 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352801.492328 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352801.492424 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.492439 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.492462 +INFO: TimeDuration, Event = Add_end, Time = 0.000023 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.492476 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.492496 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352801.492510 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352801.492562 +INFO: TimeDuration, Event = Mul_end, Time = 0.000052 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.492577 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.492596 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352801.492612 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352801.492759 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000148 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 141.740024, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.535538 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.536445 +INFO: TimeDuration, Event = Add_end, Time = 0.000907 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.536479 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.537303 +INFO: TimeDuration, Event = Relu_end, Time = 0.000824 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.551787 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.552694 +INFO: TimeDuration, Event = Add_end, Time = 0.000906 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.552712 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.553576 +INFO: TimeDuration, Event = Relu_end, Time = 0.000864 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352801.553588 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352801.556511 +INFO: TimeDuration, Event = Pool_end, Time = 0.002922 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.578356 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.578827 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.578843 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.579275 +INFO: TimeDuration, Event = Relu_end, Time = 0.000432 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.596511 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.596986 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.597000 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.597434 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352801.597448 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352801.604132 +INFO: TimeDuration, Event = Pool_end, Time = 0.006684 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.614305 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.614603 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.614641 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.614868 +INFO: TimeDuration, Event = Relu_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.623698 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.624008 +INFO: TimeDuration, Event = Add_end, Time = 0.000310 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.624024 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.624250 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.636309 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.636613 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.636625 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.636851 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352801.636864 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352801.639596 +INFO: TimeDuration, Event = Pool_end, Time = 0.002732 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.646191 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.646545 +INFO: TimeDuration, Event = Add_end, Time = 0.000354 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.646560 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.646682 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.652884 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.653215 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.653229 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.653351 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.661021 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.661351 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.661365 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.661487 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352801.661504 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352801.664260 +INFO: TimeDuration, Event = Pool_end, Time = 0.002755 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.666456 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.666747 +INFO: TimeDuration, Event = Add_end, Time = 0.000291 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.666759 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.666801 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.669070 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.669358 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.669371 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.669413 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.671683 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.671967 +INFO: TimeDuration, Event = Add_end, Time = 0.000283 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.671980 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.672023 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352801.672036 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352801.672658 +INFO: TimeDuration, Event = Pool_end, Time = 0.000622 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352801.672676 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352801.672767 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.672780 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.672801 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.672812 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.672832 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352801.672845 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352801.672893 +INFO: TimeDuration, Event = Mul_end, Time = 0.000048 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.672905 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.672925 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352801.672940 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352801.673027 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000087 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 140.192971, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.714491 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.715394 +INFO: TimeDuration, Event = Add_end, Time = 0.000903 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.715409 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.716234 +INFO: TimeDuration, Event = Relu_end, Time = 0.000825 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.730786 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.731684 +INFO: TimeDuration, Event = Add_end, Time = 0.000898 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.731702 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.732552 +INFO: TimeDuration, Event = Relu_end, Time = 0.000850 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352801.732567 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352801.735512 +INFO: TimeDuration, Event = Pool_end, Time = 0.002945 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.758409 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.758880 +INFO: TimeDuration, Event = Add_end, Time = 0.000472 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.758897 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.759331 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.775032 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.775508 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.775521 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.775954 +INFO: TimeDuration, Event = Relu_end, Time = 0.000433 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352801.775979 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352801.783715 +INFO: TimeDuration, Event = Pool_end, Time = 0.007736 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.792991 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.793290 +INFO: TimeDuration, Event = Add_end, Time = 0.000299 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.793303 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.793529 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.802402 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.802709 +INFO: TimeDuration, Event = Add_end, Time = 0.000307 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.802722 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.802949 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.814996 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.815297 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.815334 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.815561 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352801.815574 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352801.819343 +INFO: TimeDuration, Event = Pool_end, Time = 0.003769 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.824977 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.825313 +INFO: TimeDuration, Event = Add_end, Time = 0.000336 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.825326 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.825449 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.831357 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.831692 +INFO: TimeDuration, Event = Add_end, Time = 0.000335 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.831705 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.831828 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.839475 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.839806 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.839818 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.839939 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352801.839957 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352801.842717 +INFO: TimeDuration, Event = Pool_end, Time = 0.002760 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.844933 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.845219 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.845232 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.845274 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.847535 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.847829 +INFO: TimeDuration, Event = Add_end, Time = 0.000294 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.847843 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.847885 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.850171 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.850458 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.850475 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.850518 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352801.850556 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352801.851147 +INFO: TimeDuration, Event = Pool_end, Time = 0.000591 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352801.851168 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352801.851259 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.851273 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.851294 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.851307 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.851327 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352801.851342 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352801.851390 +INFO: TimeDuration, Event = Mul_end, Time = 0.000048 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.851404 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.851424 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352801.851438 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352801.851521 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000083 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.792561, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.891839 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.892741 +INFO: TimeDuration, Event = Add_end, Time = 0.000902 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.892759 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.893586 +INFO: TimeDuration, Event = Relu_end, Time = 0.000828 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.908062 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.908966 +INFO: TimeDuration, Event = Add_end, Time = 0.000904 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.908983 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.909834 +INFO: TimeDuration, Event = Relu_end, Time = 0.000851 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352801.909848 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352801.912789 +INFO: TimeDuration, Event = Pool_end, Time = 0.002941 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.934628 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.935099 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.935114 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.935548 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.952283 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.952761 +INFO: TimeDuration, Event = Add_end, Time = 0.000478 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.952777 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.953212 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352801.953224 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352801.959910 +INFO: TimeDuration, Event = Pool_end, Time = 0.006686 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.970086 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.970384 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.970398 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.970623 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.979313 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.979619 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.979631 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.979857 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352801.991925 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352801.992227 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352801.992240 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352801.992469 +INFO: TimeDuration, Event = Relu_end, Time = 0.000229 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352801.992481 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352801.995220 +INFO: TimeDuration, Event = Pool_end, Time = 0.002738 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.001811 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.002146 +INFO: TimeDuration, Event = Add_end, Time = 0.000336 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.002160 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.002281 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.008190 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.008522 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.008536 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.008660 +INFO: TimeDuration, Event = Relu_end, Time = 0.000124 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.016628 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.016956 +INFO: TimeDuration, Event = Add_end, Time = 0.000328 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.016968 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.017090 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352802.017108 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352802.019869 +INFO: TimeDuration, Event = Pool_end, Time = 0.002761 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.022109 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.022394 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.022408 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.022450 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.024727 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.025014 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.025027 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.025069 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.027357 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.027644 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.027659 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.027701 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352802.027713 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352802.028349 +INFO: TimeDuration, Event = Pool_end, Time = 0.000635 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352802.028365 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352802.028457 +INFO: TimeDuration, Event = Mul_end, Time = 0.000093 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.028473 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.028495 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.028508 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.028528 +INFO: TimeDuration, Event = Relu_end, Time = 0.000019 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352802.028543 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352802.028591 +INFO: TimeDuration, Event = Mul_end, Time = 0.000048 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.028604 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.028623 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352802.028638 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352802.028721 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000083 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 66.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.660540, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.068901 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.069807 +INFO: TimeDuration, Event = Add_end, Time = 0.000906 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.069823 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.070648 +INFO: TimeDuration, Event = Relu_end, Time = 0.000825 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.085161 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.086069 +INFO: TimeDuration, Event = Add_end, Time = 0.000908 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.086086 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.086941 +INFO: TimeDuration, Event = Relu_end, Time = 0.000855 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352802.086955 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352802.089883 +INFO: TimeDuration, Event = Pool_end, Time = 0.002928 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.111725 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.112195 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.112212 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.112647 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.129364 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.129839 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.129852 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.130288 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352802.130300 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352802.136990 +INFO: TimeDuration, Event = Pool_end, Time = 0.006690 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.147160 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.147457 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.147470 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.147696 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.156387 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.156690 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.156704 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.156930 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.168990 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.169291 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.169306 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.169534 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352802.169546 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352802.172283 +INFO: TimeDuration, Event = Pool_end, Time = 0.002737 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.178874 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.179207 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.179219 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.179341 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.185574 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.185909 +INFO: TimeDuration, Event = Add_end, Time = 0.000334 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.185921 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.186044 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.193693 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.194021 +INFO: TimeDuration, Event = Add_end, Time = 0.000328 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.194035 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.194156 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352802.194176 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352802.196936 +INFO: TimeDuration, Event = Pool_end, Time = 0.002760 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.199149 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.199434 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.199447 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.199488 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.201776 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.202067 +INFO: TimeDuration, Event = Add_end, Time = 0.000292 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.202081 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.202123 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.204440 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.204728 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.204740 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.204782 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352802.204796 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352802.205416 +INFO: TimeDuration, Event = Pool_end, Time = 0.000620 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352802.205434 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352802.205525 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.205541 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.205563 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.205575 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.205594 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352802.205608 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352802.205655 +INFO: TimeDuration, Event = Mul_end, Time = 0.000048 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.205669 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.205688 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352802.205702 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352802.205786 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000084 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.639849, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.246272 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.247174 +INFO: TimeDuration, Event = Add_end, Time = 0.000902 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.247190 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.248018 +INFO: TimeDuration, Event = Relu_end, Time = 0.000828 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.262562 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.263466 +INFO: TimeDuration, Event = Add_end, Time = 0.000904 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.263482 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.264333 +INFO: TimeDuration, Event = Relu_end, Time = 0.000850 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352802.264437 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352802.267287 +INFO: TimeDuration, Event = Pool_end, Time = 0.002849 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.289127 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.289596 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.289614 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.290046 +INFO: TimeDuration, Event = Relu_end, Time = 0.000432 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.306821 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.307301 +INFO: TimeDuration, Event = Add_end, Time = 0.000480 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.307315 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.307756 +INFO: TimeDuration, Event = Relu_end, Time = 0.000440 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352802.307769 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352802.314444 +INFO: TimeDuration, Event = Pool_end, Time = 0.006675 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.324620 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.324917 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.324930 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.325156 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.333848 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.334152 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.334167 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.334393 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.346466 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.346767 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.346782 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.347008 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352802.347021 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352802.349760 +INFO: TimeDuration, Event = Pool_end, Time = 0.002740 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.356356 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.356693 +INFO: TimeDuration, Event = Add_end, Time = 0.000337 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.356707 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.356829 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.362735 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.363066 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.363079 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.363202 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.370854 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.371183 +INFO: TimeDuration, Event = Add_end, Time = 0.000329 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.371196 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.371318 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352802.371352 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352802.374097 +INFO: TimeDuration, Event = Pool_end, Time = 0.002745 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.376440 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.376729 +INFO: TimeDuration, Event = Add_end, Time = 0.000289 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.376742 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.376784 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.378947 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.379234 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.379246 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.379288 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.381592 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.381876 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.381890 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.381932 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352802.381946 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352802.382566 +INFO: TimeDuration, Event = Pool_end, Time = 0.000620 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352802.382584 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352802.382675 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.382690 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.382712 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.382724 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.382745 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352802.382760 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352802.382812 +INFO: TimeDuration, Event = Mul_end, Time = 0.000052 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.382840 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.382861 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352802.382876 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352802.382959 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000083 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.259984, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.423331 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.424229 +INFO: TimeDuration, Event = Add_end, Time = 0.000898 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.424244 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.425076 +INFO: TimeDuration, Event = Relu_end, Time = 0.000832 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.439553 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.440454 +INFO: TimeDuration, Event = Add_end, Time = 0.000901 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.440472 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.441325 +INFO: TimeDuration, Event = Relu_end, Time = 0.000853 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352802.441339 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352802.444278 +INFO: TimeDuration, Event = Pool_end, Time = 0.002939 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.466119 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.466589 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.466604 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.467039 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.484020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.484504 +INFO: TimeDuration, Event = Add_end, Time = 0.000484 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.484519 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.484958 +INFO: TimeDuration, Event = Relu_end, Time = 0.000439 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352802.484970 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352802.491648 +INFO: TimeDuration, Event = Pool_end, Time = 0.006678 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.501819 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.502117 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.502131 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.502358 +INFO: TimeDuration, Event = Relu_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.511043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.511348 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.511362 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.511588 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.523649 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.523950 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.523962 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.524187 +INFO: TimeDuration, Event = Relu_end, Time = 0.000224 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352802.524202 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352802.526944 +INFO: TimeDuration, Event = Pool_end, Time = 0.002743 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.533689 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.534026 +INFO: TimeDuration, Event = Add_end, Time = 0.000336 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.534038 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.534161 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.540398 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.540728 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.540754 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.540877 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.548561 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.548889 +INFO: TimeDuration, Event = Add_end, Time = 0.000329 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.548903 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.549024 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352802.549043 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352802.551803 +INFO: TimeDuration, Event = Pool_end, Time = 0.002760 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.554014 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.554299 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.554312 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.554353 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.556616 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.556898 +INFO: TimeDuration, Event = Add_end, Time = 0.000282 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.556912 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.556954 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.559272 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.559556 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.559568 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.559611 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352802.559624 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352802.560249 +INFO: TimeDuration, Event = Pool_end, Time = 0.000624 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352802.560266 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352802.560360 +INFO: TimeDuration, Event = Mul_end, Time = 0.000093 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.560434 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.560461 +INFO: TimeDuration, Event = Add_end, Time = 0.000027 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.560474 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.560493 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352802.560508 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352802.560562 +INFO: TimeDuration, Event = Mul_end, Time = 0.000054 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.560575 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.560594 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352802.560608 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352802.560698 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000090 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 140.128704, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.600996 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.601909 +INFO: TimeDuration, Event = Add_end, Time = 0.000914 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.601926 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.602752 +INFO: TimeDuration, Event = Relu_end, Time = 0.000826 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.617244 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.618147 +INFO: TimeDuration, Event = Add_end, Time = 0.000903 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.618163 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.619015 +INFO: TimeDuration, Event = Relu_end, Time = 0.000853 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352802.619029 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352802.621963 +INFO: TimeDuration, Event = Pool_end, Time = 0.002934 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.643809 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.644280 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.644296 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.644734 +INFO: TimeDuration, Event = Relu_end, Time = 0.000438 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.662507 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.662983 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.662997 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.663432 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352802.663445 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352802.670133 +INFO: TimeDuration, Event = Pool_end, Time = 0.006688 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.680299 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.680603 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.680616 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.680842 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.689726 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.690030 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.690042 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.690267 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.702340 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.702641 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.702654 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.702880 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352802.702893 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352802.705634 +INFO: TimeDuration, Event = Pool_end, Time = 0.002742 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.712217 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.712550 +INFO: TimeDuration, Event = Add_end, Time = 0.000334 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.712566 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.712688 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.718935 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.719268 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.719283 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.719405 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.727051 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.727381 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.727394 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.727515 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352802.727534 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352802.730293 +INFO: TimeDuration, Event = Pool_end, Time = 0.002758 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.732501 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.732789 +INFO: TimeDuration, Event = Add_end, Time = 0.000288 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.732802 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.732844 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.735129 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.735420 +INFO: TimeDuration, Event = Add_end, Time = 0.000292 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.735434 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.735476 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.737814 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.738098 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.738111 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.738153 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352802.738165 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352802.738786 +INFO: TimeDuration, Event = Pool_end, Time = 0.000621 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352802.738807 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352802.738899 +INFO: TimeDuration, Event = Mul_end, Time = 0.000092 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.738914 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.738935 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.738947 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.738966 +INFO: TimeDuration, Event = Relu_end, Time = 0.000019 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352802.738979 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352802.739031 +INFO: TimeDuration, Event = Mul_end, Time = 0.000052 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.739044 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.739063 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352802.739077 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352802.739207 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000130 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 66.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 140.980088, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.779632 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.780535 +INFO: TimeDuration, Event = Add_end, Time = 0.000903 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.780552 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.781378 +INFO: TimeDuration, Event = Relu_end, Time = 0.000826 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.797749 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.798652 +INFO: TimeDuration, Event = Add_end, Time = 0.000903 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.798667 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.799520 +INFO: TimeDuration, Event = Relu_end, Time = 0.000852 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352802.799546 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352802.802475 +INFO: TimeDuration, Event = Pool_end, Time = 0.002929 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.824314 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.824785 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.824802 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.825238 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.841974 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.842449 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.842462 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.842896 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352802.842911 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352802.849600 +INFO: TimeDuration, Event = Pool_end, Time = 0.006689 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.859766 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.860063 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.860076 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.860307 +INFO: TimeDuration, Event = Relu_end, Time = 0.000232 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.868996 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.869300 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.869313 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.869540 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.881607 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.881907 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.881920 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.882146 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352802.882158 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352802.884902 +INFO: TimeDuration, Event = Pool_end, Time = 0.002744 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.891485 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.891822 +INFO: TimeDuration, Event = Add_end, Time = 0.000336 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.891835 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.891956 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.897873 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.898209 +INFO: TimeDuration, Event = Add_end, Time = 0.000336 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.898224 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.898347 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.905994 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.906322 +INFO: TimeDuration, Event = Add_end, Time = 0.000328 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.906335 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.906456 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352802.906476 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352802.909237 +INFO: TimeDuration, Event = Pool_end, Time = 0.002761 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.911461 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.911748 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.911762 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.911804 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.914093 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.914380 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.914394 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.914437 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.916713 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.916997 +INFO: TimeDuration, Event = Add_end, Time = 0.000283 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.917009 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.917051 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352802.917065 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352802.917688 +INFO: TimeDuration, Event = Pool_end, Time = 0.000623 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352802.917723 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352802.917821 +INFO: TimeDuration, Event = Mul_end, Time = 0.000099 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.917848 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.917870 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.917881 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.917901 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352802.917915 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352802.917967 +INFO: TimeDuration, Event = Mul_end, Time = 0.000051 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.917981 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.918001 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352802.918025 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352802.918113 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000089 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 141.237154, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.958111 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.959022 +INFO: TimeDuration, Event = Add_end, Time = 0.000911 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.959036 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.959865 +INFO: TimeDuration, Event = Relu_end, Time = 0.000828 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352802.974397 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352802.975307 +INFO: TimeDuration, Event = Add_end, Time = 0.000910 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352802.975325 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352802.976181 +INFO: TimeDuration, Event = Relu_end, Time = 0.000856 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352802.976195 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352802.979123 +INFO: TimeDuration, Event = Pool_end, Time = 0.002929 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.000965 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.001436 +INFO: TimeDuration, Event = Add_end, Time = 0.000472 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.001467 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.001904 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.018608 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.019084 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.019097 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.019533 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352803.019546 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352803.026234 +INFO: TimeDuration, Event = Pool_end, Time = 0.006688 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.036410 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.036707 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.036721 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.036948 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.045647 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.045951 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.045964 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.046190 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.058262 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.058568 +INFO: TimeDuration, Event = Add_end, Time = 0.000306 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.058581 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.058807 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352803.058821 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352803.061556 +INFO: TimeDuration, Event = Pool_end, Time = 0.002735 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.068149 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.068487 +INFO: TimeDuration, Event = Add_end, Time = 0.000338 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.068502 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.068625 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.074848 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.075180 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.075195 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.075318 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.082970 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.083300 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.083314 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.083439 +INFO: TimeDuration, Event = Relu_end, Time = 0.000125 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352803.083456 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352803.086211 +INFO: TimeDuration, Event = Pool_end, Time = 0.002755 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.088449 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.088734 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.088746 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.088788 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.091071 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.091356 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.091370 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.091412 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.093707 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.093994 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.094007 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.094049 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352803.094062 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352803.094683 +INFO: TimeDuration, Event = Pool_end, Time = 0.000621 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352803.094701 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352803.094791 +INFO: TimeDuration, Event = Mul_end, Time = 0.000090 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.094807 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.094829 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.094842 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.094862 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352803.094875 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352803.094923 +INFO: TimeDuration, Event = Mul_end, Time = 0.000048 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.094936 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.094955 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352803.094968 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352803.095051 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000082 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 68.199997 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.703213, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.137779 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.138692 +INFO: TimeDuration, Event = Add_end, Time = 0.000913 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.138708 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.139529 +INFO: TimeDuration, Event = Relu_end, Time = 0.000821 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.154021 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.154926 +INFO: TimeDuration, Event = Add_end, Time = 0.000905 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.154942 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.155791 +INFO: TimeDuration, Event = Relu_end, Time = 0.000849 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352803.155816 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352803.158764 +INFO: TimeDuration, Event = Pool_end, Time = 0.002948 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.180596 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.181073 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.181088 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.181521 +INFO: TimeDuration, Event = Relu_end, Time = 0.000433 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.198247 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.198724 +INFO: TimeDuration, Event = Add_end, Time = 0.000477 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.198737 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.199172 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352803.199185 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352803.205873 +INFO: TimeDuration, Event = Pool_end, Time = 0.006688 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.216034 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.216331 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.216342 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.216569 +INFO: TimeDuration, Event = Relu_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.225260 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.225564 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.225577 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.225802 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.237869 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.238171 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.238183 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.238408 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352803.238421 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352803.241162 +INFO: TimeDuration, Event = Pool_end, Time = 0.002740 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.247753 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.248086 +INFO: TimeDuration, Event = Add_end, Time = 0.000334 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.248099 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.248221 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.254136 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.254467 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.254480 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.254602 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.262585 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.262916 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.262930 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.263052 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352803.263070 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352803.265825 +INFO: TimeDuration, Event = Pool_end, Time = 0.002754 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.268013 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.268296 +INFO: TimeDuration, Event = Add_end, Time = 0.000283 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.268316 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.268358 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.270617 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.270907 +INFO: TimeDuration, Event = Add_end, Time = 0.000290 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.270919 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.270962 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.273229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.273512 +INFO: TimeDuration, Event = Add_end, Time = 0.000283 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.273527 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.273568 +INFO: TimeDuration, Event = Relu_end, Time = 0.000041 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352803.273581 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352803.274205 +INFO: TimeDuration, Event = Pool_end, Time = 0.000624 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352803.274223 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352803.274314 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.274327 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.274355 +INFO: TimeDuration, Event = Add_end, Time = 0.000028 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.274367 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.274386 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352803.274401 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352803.274450 +INFO: TimeDuration, Event = Mul_end, Time = 0.000049 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.274463 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.274483 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352803.274497 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352803.274583 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000086 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.580767, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.315004 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.315911 +INFO: TimeDuration, Event = Add_end, Time = 0.000907 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.315926 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.316752 +INFO: TimeDuration, Event = Relu_end, Time = 0.000826 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.331234 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.332134 +INFO: TimeDuration, Event = Add_end, Time = 0.000900 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.332151 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.333001 +INFO: TimeDuration, Event = Relu_end, Time = 0.000850 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352803.333016 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352803.335959 +INFO: TimeDuration, Event = Pool_end, Time = 0.002943 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.357801 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.358270 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.358287 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.358727 +INFO: TimeDuration, Event = Relu_end, Time = 0.000440 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.375469 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.375943 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.375957 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.376391 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352803.376434 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352803.383097 +INFO: TimeDuration, Event = Pool_end, Time = 0.006663 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.393269 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.393568 +INFO: TimeDuration, Event = Add_end, Time = 0.000299 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.393582 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.393808 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.402507 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.402812 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.402826 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.403051 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.415213 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.415521 +INFO: TimeDuration, Event = Add_end, Time = 0.000309 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.415535 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.415778 +INFO: TimeDuration, Event = Relu_end, Time = 0.000243 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352803.415790 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352803.418502 +INFO: TimeDuration, Event = Pool_end, Time = 0.002712 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.425082 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.425417 +INFO: TimeDuration, Event = Add_end, Time = 0.000335 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.425431 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.425552 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.431681 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.432012 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.432038 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.432164 +INFO: TimeDuration, Event = Relu_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.440119 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.440450 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.440464 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.440588 +INFO: TimeDuration, Event = Relu_end, Time = 0.000124 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352803.440610 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352803.443362 +INFO: TimeDuration, Event = Pool_end, Time = 0.002753 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.445583 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.445867 +INFO: TimeDuration, Event = Add_end, Time = 0.000283 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.445879 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.445920 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.448212 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.448502 +INFO: TimeDuration, Event = Add_end, Time = 0.000290 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.448516 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.448559 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.450837 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.451119 +INFO: TimeDuration, Event = Add_end, Time = 0.000282 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.451133 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.451176 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352803.451189 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352803.451814 +INFO: TimeDuration, Event = Pool_end, Time = 0.000626 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352803.451833 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352803.451926 +INFO: TimeDuration, Event = Mul_end, Time = 0.000093 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.451942 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.451965 +INFO: TimeDuration, Event = Add_end, Time = 0.000023 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.451980 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.451999 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352803.452013 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352803.452065 +INFO: TimeDuration, Event = Mul_end, Time = 0.000052 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.452079 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.452099 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352803.452114 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352803.452253 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000139 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 140.041191, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.494473 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.495375 +INFO: TimeDuration, Event = Add_end, Time = 0.000902 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.495391 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.496219 +INFO: TimeDuration, Event = Relu_end, Time = 0.000828 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.510698 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.511620 +INFO: TimeDuration, Event = Add_end, Time = 0.000922 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.511639 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.512487 +INFO: TimeDuration, Event = Relu_end, Time = 0.000848 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352803.512501 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352803.515425 +INFO: TimeDuration, Event = Pool_end, Time = 0.002924 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.537268 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.537739 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.537754 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.538187 +INFO: TimeDuration, Event = Relu_end, Time = 0.000432 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.554947 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.555421 +INFO: TimeDuration, Event = Add_end, Time = 0.000474 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.555434 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.555866 +INFO: TimeDuration, Event = Relu_end, Time = 0.000433 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352803.555880 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352803.562574 +INFO: TimeDuration, Event = Pool_end, Time = 0.006694 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.573109 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.573406 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.573421 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.573647 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.582337 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.582641 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.582654 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.582880 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.594943 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.595244 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.595257 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.595482 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352803.595494 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352803.598238 +INFO: TimeDuration, Event = Pool_end, Time = 0.002743 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.604981 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.605318 +INFO: TimeDuration, Event = Add_end, Time = 0.000336 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.605330 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.605453 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.611674 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.612008 +INFO: TimeDuration, Event = Add_end, Time = 0.000334 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.612021 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.612142 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.620135 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.620467 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.620482 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.620604 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352803.620622 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352803.623376 +INFO: TimeDuration, Event = Pool_end, Time = 0.002753 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.625586 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.625868 +INFO: TimeDuration, Event = Add_end, Time = 0.000282 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.625880 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.625923 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.628198 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.628490 +INFO: TimeDuration, Event = Add_end, Time = 0.000293 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.628504 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.628547 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.630803 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.631087 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.631101 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.631143 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352803.631157 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352803.631777 +INFO: TimeDuration, Event = Pool_end, Time = 0.000621 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352803.631797 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352803.631888 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.631902 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.631924 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.631936 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.631955 +INFO: TimeDuration, Event = Relu_end, Time = 0.000019 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352803.631985 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352803.632035 +INFO: TimeDuration, Event = Mul_end, Time = 0.000050 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.632048 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.632067 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352803.632081 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352803.632163 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000082 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 140.478102, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.672512 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.673439 +INFO: TimeDuration, Event = Add_end, Time = 0.000927 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.673453 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.674263 +INFO: TimeDuration, Event = Relu_end, Time = 0.000811 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.688751 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.689648 +INFO: TimeDuration, Event = Add_end, Time = 0.000897 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.689664 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.690514 +INFO: TimeDuration, Event = Relu_end, Time = 0.000850 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352803.690529 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352803.693475 +INFO: TimeDuration, Event = Pool_end, Time = 0.002947 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.715315 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.715785 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.715802 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.716237 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.732988 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.733464 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.733477 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.733911 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352803.733924 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352803.740614 +INFO: TimeDuration, Event = Pool_end, Time = 0.006690 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.750784 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.751081 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.751095 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.751321 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.760018 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.760322 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.760433 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.760658 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.772664 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.772966 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.772980 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.773206 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352803.773220 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352803.775956 +INFO: TimeDuration, Event = Pool_end, Time = 0.002736 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.782539 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.782876 +INFO: TimeDuration, Event = Add_end, Time = 0.000338 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.782889 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.783010 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.789233 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.789565 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.789591 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.789714 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.797361 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.797697 +INFO: TimeDuration, Event = Add_end, Time = 0.000336 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.797709 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.797831 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352803.797847 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352803.800600 +INFO: TimeDuration, Event = Pool_end, Time = 0.002753 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.802849 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.803136 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.803149 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.803190 +INFO: TimeDuration, Event = Relu_end, Time = 0.000041 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.805477 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.805762 +INFO: TimeDuration, Event = Add_end, Time = 0.000286 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.805775 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.805817 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.808123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.808408 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.808419 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.808462 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352803.808474 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352803.809098 +INFO: TimeDuration, Event = Pool_end, Time = 0.000623 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352803.809131 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352803.809223 +INFO: TimeDuration, Event = Mul_end, Time = 0.000092 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.809237 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.809258 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.809271 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.809291 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352803.809306 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352803.809354 +INFO: TimeDuration, Event = Mul_end, Time = 0.000047 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.809366 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.809386 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352803.809399 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352803.809499 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000100 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 66.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.773441, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.849681 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.850585 +INFO: TimeDuration, Event = Add_end, Time = 0.000904 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.850602 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.851429 +INFO: TimeDuration, Event = Relu_end, Time = 0.000827 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.865921 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.866825 +INFO: TimeDuration, Event = Add_end, Time = 0.000904 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.866841 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.867693 +INFO: TimeDuration, Event = Relu_end, Time = 0.000852 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352803.867706 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352803.870646 +INFO: TimeDuration, Event = Pool_end, Time = 0.002940 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.892490 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.892960 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.892976 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.893412 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.910255 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.910730 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.910743 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.911176 +INFO: TimeDuration, Event = Relu_end, Time = 0.000433 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352803.911188 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352803.917882 +INFO: TimeDuration, Event = Pool_end, Time = 0.006695 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.928049 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.928346 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.928358 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.928586 +INFO: TimeDuration, Event = Relu_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.937466 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.937770 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.937785 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.938010 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.950100 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.950402 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.950416 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.950642 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352803.950654 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352803.953396 +INFO: TimeDuration, Event = Pool_end, Time = 0.002742 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.959984 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.960324 +INFO: TimeDuration, Event = Add_end, Time = 0.000340 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.960454 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.960575 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.966713 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.967045 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.967059 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.967181 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.975147 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.975478 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.975493 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.975616 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352803.975636 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352803.978389 +INFO: TimeDuration, Event = Pool_end, Time = 0.002753 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.980602 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.980889 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.980903 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.980945 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.983239 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.983528 +INFO: TimeDuration, Event = Add_end, Time = 0.000290 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.983542 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.983584 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.985903 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.986187 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.986201 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.986243 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352803.986256 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352803.986880 +INFO: TimeDuration, Event = Pool_end, Time = 0.000624 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352803.986898 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352803.986989 +INFO: TimeDuration, Event = Mul_end, Time = 0.000090 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.987003 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.987024 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352803.987036 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352803.987056 +INFO: TimeDuration, Event = Relu_end, Time = 0.000019 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352803.987070 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352803.987117 +INFO: TimeDuration, Event = Mul_end, Time = 0.000047 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352803.987130 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352803.987150 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352803.987163 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352803.987245 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000082 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 140.219458, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.028768 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.029670 +INFO: TimeDuration, Event = Add_end, Time = 0.000902 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.029687 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.030512 +INFO: TimeDuration, Event = Relu_end, Time = 0.000825 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.045016 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.045914 +INFO: TimeDuration, Event = Add_end, Time = 0.000899 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.045930 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.046782 +INFO: TimeDuration, Event = Relu_end, Time = 0.000851 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352804.046794 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352804.049732 +INFO: TimeDuration, Event = Pool_end, Time = 0.002938 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.072634 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.073105 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.073123 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.073556 +INFO: TimeDuration, Event = Relu_end, Time = 0.000433 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.089249 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.089724 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.089738 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.090170 +INFO: TimeDuration, Event = Relu_end, Time = 0.000432 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352804.090183 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352804.097928 +INFO: TimeDuration, Event = Pool_end, Time = 0.007745 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.107202 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.107499 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.107513 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.107739 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.116639 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.116946 +INFO: TimeDuration, Event = Add_end, Time = 0.000306 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.116960 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.117187 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.129243 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.129547 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.129577 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.129803 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352804.129815 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352804.133589 +INFO: TimeDuration, Event = Pool_end, Time = 0.003774 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.139210 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.139546 +INFO: TimeDuration, Event = Add_end, Time = 0.000336 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.139559 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.139681 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.145594 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.145933 +INFO: TimeDuration, Event = Add_end, Time = 0.000339 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.145946 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.146069 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.153715 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.154048 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.154063 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.154185 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352804.154204 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352804.156957 +INFO: TimeDuration, Event = Pool_end, Time = 0.002754 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.159165 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.159449 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.159462 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.159504 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.161792 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.162079 +INFO: TimeDuration, Event = Add_end, Time = 0.000288 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.162093 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.162135 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.164439 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.164725 +INFO: TimeDuration, Event = Add_end, Time = 0.000286 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.164738 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.164781 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352804.164794 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352804.165386 +INFO: TimeDuration, Event = Pool_end, Time = 0.000592 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352804.165405 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352804.165500 +INFO: TimeDuration, Event = Mul_end, Time = 0.000095 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.165515 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.165536 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.165548 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.165568 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352804.165581 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352804.165630 +INFO: TimeDuration, Event = Mul_end, Time = 0.000049 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.165644 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.165664 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352804.165679 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352804.165762 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000083 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.714526, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.205790 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.206689 +INFO: TimeDuration, Event = Add_end, Time = 0.000899 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.206705 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.207528 +INFO: TimeDuration, Event = Relu_end, Time = 0.000823 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.222043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.222942 +INFO: TimeDuration, Event = Add_end, Time = 0.000899 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.222958 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.223805 +INFO: TimeDuration, Event = Relu_end, Time = 0.000847 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352804.223820 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352804.226767 +INFO: TimeDuration, Event = Pool_end, Time = 0.002947 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.248621 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.249092 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.249107 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.249542 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.266266 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.266741 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.266754 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.267187 +INFO: TimeDuration, Event = Relu_end, Time = 0.000433 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352804.267200 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352804.273893 +INFO: TimeDuration, Event = Pool_end, Time = 0.006693 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.284067 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.284364 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.284434 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.284659 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.293303 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.293608 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.293621 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.293846 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.305917 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.306219 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.306233 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.306459 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352804.306494 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352804.309215 +INFO: TimeDuration, Event = Pool_end, Time = 0.002721 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.315798 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.316135 +INFO: TimeDuration, Event = Add_end, Time = 0.000337 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.316149 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.316272 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.322181 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.322519 +INFO: TimeDuration, Event = Add_end, Time = 0.000338 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.322551 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.322673 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.330620 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.330950 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.330963 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.331086 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352804.331108 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352804.333863 +INFO: TimeDuration, Event = Pool_end, Time = 0.002756 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.336076 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.336359 +INFO: TimeDuration, Event = Add_end, Time = 0.000283 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.336370 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.336413 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.338688 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.338976 +INFO: TimeDuration, Event = Add_end, Time = 0.000288 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.338989 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.339031 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.341303 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.341588 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.341614 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.341657 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352804.341671 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352804.342278 +INFO: TimeDuration, Event = Pool_end, Time = 0.000607 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352804.342298 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352804.342388 +INFO: TimeDuration, Event = Mul_end, Time = 0.000090 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.342416 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.342438 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.342463 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.342483 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352804.342499 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352804.342551 +INFO: TimeDuration, Event = Mul_end, Time = 0.000053 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.342565 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.342585 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352804.342600 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352804.342693 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000093 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.620729, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.382652 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.383560 +INFO: TimeDuration, Event = Add_end, Time = 0.000908 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.383574 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.384405 +INFO: TimeDuration, Event = Relu_end, Time = 0.000830 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.398900 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.399798 +INFO: TimeDuration, Event = Add_end, Time = 0.000898 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.399813 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.400659 +INFO: TimeDuration, Event = Relu_end, Time = 0.000846 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352804.400674 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352804.403627 +INFO: TimeDuration, Event = Pool_end, Time = 0.002953 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.425462 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.425934 +INFO: TimeDuration, Event = Add_end, Time = 0.000472 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.425949 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.426390 +INFO: TimeDuration, Event = Relu_end, Time = 0.000441 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.443151 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.443627 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.443640 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.444075 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352804.444088 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352804.450777 +INFO: TimeDuration, Event = Pool_end, Time = 0.006689 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.460939 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.461237 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.461250 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.461475 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.470168 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.470472 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.470489 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.470714 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.482788 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.483089 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.483117 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.483344 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352804.483357 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352804.486083 +INFO: TimeDuration, Event = Pool_end, Time = 0.002726 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.492667 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.493002 +INFO: TimeDuration, Event = Add_end, Time = 0.000335 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.493015 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.493138 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.499050 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.499383 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.499395 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.499518 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.507492 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.507822 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.507835 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.507958 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352804.507975 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352804.510731 +INFO: TimeDuration, Event = Pool_end, Time = 0.002756 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.512950 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.513239 +INFO: TimeDuration, Event = Add_end, Time = 0.000289 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.513264 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.513307 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.515579 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.515867 +INFO: TimeDuration, Event = Add_end, Time = 0.000289 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.515881 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.515923 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.518200 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.518482 +INFO: TimeDuration, Event = Add_end, Time = 0.000282 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.518496 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.518538 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352804.518550 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352804.519174 +INFO: TimeDuration, Event = Pool_end, Time = 0.000624 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352804.519212 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352804.519307 +INFO: TimeDuration, Event = Mul_end, Time = 0.000095 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.519322 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.519346 +INFO: TimeDuration, Event = Add_end, Time = 0.000024 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.519367 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.519388 +INFO: TimeDuration, Event = Relu_end, Time = 0.000021 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352804.519403 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352804.519454 +INFO: TimeDuration, Event = Mul_end, Time = 0.000051 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.519467 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.519486 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352804.519501 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352804.519633 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000132 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 66.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.753280, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.559399 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.560314 +INFO: TimeDuration, Event = Add_end, Time = 0.000915 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.560327 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.561154 +INFO: TimeDuration, Event = Relu_end, Time = 0.000827 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.575636 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.576547 +INFO: TimeDuration, Event = Add_end, Time = 0.000910 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.576565 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.577417 +INFO: TimeDuration, Event = Relu_end, Time = 0.000853 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352804.577430 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352804.580362 +INFO: TimeDuration, Event = Pool_end, Time = 0.002932 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.602204 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.602674 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.602690 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.603126 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.619850 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.620326 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.620435 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.620870 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352804.620883 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352804.627476 +INFO: TimeDuration, Event = Pool_end, Time = 0.006593 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.637631 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.637929 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.637942 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.638169 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.646859 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.647164 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.647177 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.647403 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.659469 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.659773 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.659786 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.660011 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352804.660026 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352804.662763 +INFO: TimeDuration, Event = Pool_end, Time = 0.002737 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.669348 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.669686 +INFO: TimeDuration, Event = Add_end, Time = 0.000337 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.669713 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.669835 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.676045 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.676381 +INFO: TimeDuration, Event = Add_end, Time = 0.000336 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.676433 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.676556 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.684487 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.684818 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.684831 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.684954 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352804.684971 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352804.687726 +INFO: TimeDuration, Event = Pool_end, Time = 0.002756 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.689946 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.690235 +INFO: TimeDuration, Event = Add_end, Time = 0.000290 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.690248 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.690290 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.692554 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.692844 +INFO: TimeDuration, Event = Add_end, Time = 0.000290 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.692857 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.692899 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.695167 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.695452 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.695466 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.695508 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352804.695521 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352804.696142 +INFO: TimeDuration, Event = Pool_end, Time = 0.000621 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352804.696160 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352804.696251 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.696265 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.696286 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.696298 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.696325 +INFO: TimeDuration, Event = Relu_end, Time = 0.000027 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352804.696435 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352804.696486 +INFO: TimeDuration, Event = Mul_end, Time = 0.000051 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.696500 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.696520 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352804.696534 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352804.696618 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000084 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.727092, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.736525 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.737432 +INFO: TimeDuration, Event = Add_end, Time = 0.000907 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.737446 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.738277 +INFO: TimeDuration, Event = Relu_end, Time = 0.000831 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.754546 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.755467 +INFO: TimeDuration, Event = Add_end, Time = 0.000921 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.755483 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.756339 +INFO: TimeDuration, Event = Relu_end, Time = 0.000855 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352804.756351 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352804.759605 +INFO: TimeDuration, Event = Pool_end, Time = 0.003255 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.781424 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.781895 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.781914 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.782348 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.796996 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.797468 +INFO: TimeDuration, Event = Add_end, Time = 0.000472 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.797481 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.797917 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352804.797929 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352804.804623 +INFO: TimeDuration, Event = Pool_end, Time = 0.006694 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.815860 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.816162 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.816176 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.816405 +INFO: TimeDuration, Event = Relu_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.825082 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.825388 +INFO: TimeDuration, Event = Add_end, Time = 0.000306 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.825403 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.825631 +INFO: TimeDuration, Event = Relu_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.837686 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.837986 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.837999 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.838225 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352804.838237 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352804.840981 +INFO: TimeDuration, Event = Pool_end, Time = 0.002743 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.847581 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.847908 +INFO: TimeDuration, Event = Add_end, Time = 0.000326 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.847921 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.848043 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.853972 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.854298 +INFO: TimeDuration, Event = Add_end, Time = 0.000326 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.854312 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.854434 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.862407 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.862737 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.862750 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.862873 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352804.862890 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352804.865648 +INFO: TimeDuration, Event = Pool_end, Time = 0.002757 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.867879 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.868166 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.868184 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.868227 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.870506 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.870796 +INFO: TimeDuration, Event = Add_end, Time = 0.000289 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.870810 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.870853 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.873144 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.873433 +INFO: TimeDuration, Event = Add_end, Time = 0.000289 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.873447 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.873489 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352804.873502 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352804.874120 +INFO: TimeDuration, Event = Pool_end, Time = 0.000619 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352804.874139 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352804.874239 +INFO: TimeDuration, Event = Mul_end, Time = 0.000100 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.874268 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.874290 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.874302 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.874322 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352804.874337 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352804.874391 +INFO: TimeDuration, Event = Mul_end, Time = 0.000055 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.874404 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.874424 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352804.874438 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352804.874519 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000081 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 68.199997 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 140.725434, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.914560 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.915484 +INFO: TimeDuration, Event = Add_end, Time = 0.000924 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.915500 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.916328 +INFO: TimeDuration, Event = Relu_end, Time = 0.000828 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.931509 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.932411 +INFO: TimeDuration, Event = Add_end, Time = 0.000902 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.932426 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.933283 +INFO: TimeDuration, Event = Relu_end, Time = 0.000856 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352804.933298 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352804.936237 +INFO: TimeDuration, Event = Pool_end, Time = 0.002939 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.959130 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.959601 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.959619 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.960059 +INFO: TimeDuration, Event = Relu_end, Time = 0.000441 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.976995 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.977472 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.977485 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.977922 +INFO: TimeDuration, Event = Relu_end, Time = 0.000437 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352804.977934 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352804.984632 +INFO: TimeDuration, Event = Pool_end, Time = 0.006698 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352804.994784 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352804.995081 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352804.995096 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352804.995323 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.004011 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.004316 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.004437 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.004661 +INFO: TimeDuration, Event = Relu_end, Time = 0.000223 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.017358 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.017661 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.017675 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.017902 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352805.017915 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352805.020652 +INFO: TimeDuration, Event = Pool_end, Time = 0.002737 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.027240 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.027575 +INFO: TimeDuration, Event = Add_end, Time = 0.000336 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.027589 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.027711 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.033625 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.033956 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.033969 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.034092 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.041748 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.042080 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.042093 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.042214 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352805.042233 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352805.044989 +INFO: TimeDuration, Event = Pool_end, Time = 0.002755 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.047182 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.047466 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.047478 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.047520 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.049812 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.050100 +INFO: TimeDuration, Event = Add_end, Time = 0.000288 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.050113 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.050156 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.052450 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.052731 +INFO: TimeDuration, Event = Add_end, Time = 0.000281 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.052758 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.052800 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352805.052814 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352805.053423 +INFO: TimeDuration, Event = Pool_end, Time = 0.000609 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352805.053441 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352805.053533 +INFO: TimeDuration, Event = Mul_end, Time = 0.000092 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.053548 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.053570 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.053582 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.053601 +INFO: TimeDuration, Event = Relu_end, Time = 0.000019 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352805.053616 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352805.053664 +INFO: TimeDuration, Event = Mul_end, Time = 0.000048 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.053678 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.053698 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352805.053713 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352805.053797 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000084 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 141.895646, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.093792 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.094697 +INFO: TimeDuration, Event = Add_end, Time = 0.000904 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.094713 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.095541 +INFO: TimeDuration, Event = Relu_end, Time = 0.000828 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.110030 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.110939 +INFO: TimeDuration, Event = Add_end, Time = 0.000909 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.110956 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.111805 +INFO: TimeDuration, Event = Relu_end, Time = 0.000849 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352805.111819 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352805.114754 +INFO: TimeDuration, Event = Pool_end, Time = 0.002935 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.136600 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.137070 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.137086 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.137521 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.154269 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.154745 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.154759 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.155195 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352805.155208 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352805.163555 +INFO: TimeDuration, Event = Pool_end, Time = 0.008347 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.172291 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.172601 +INFO: TimeDuration, Event = Add_end, Time = 0.000311 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.172616 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.172843 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.181539 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.181865 +INFO: TimeDuration, Event = Add_end, Time = 0.000327 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.181880 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.182109 +INFO: TimeDuration, Event = Relu_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.194144 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.194447 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.194461 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.194686 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352805.194698 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352805.197437 +INFO: TimeDuration, Event = Pool_end, Time = 0.002739 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.204027 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.204366 +INFO: TimeDuration, Event = Add_end, Time = 0.000339 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.204433 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.204555 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.210416 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.210744 +INFO: TimeDuration, Event = Add_end, Time = 0.000328 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.210770 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.210893 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.218554 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.218884 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.218897 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.219019 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352805.219038 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352805.221792 +INFO: TimeDuration, Event = Pool_end, Time = 0.002754 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.223987 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.224275 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.224288 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.224330 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.226639 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.226927 +INFO: TimeDuration, Event = Add_end, Time = 0.000288 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.226941 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.226983 +INFO: TimeDuration, Event = Relu_end, Time = 0.000041 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.229255 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.229540 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.229552 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.229594 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352805.229608 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352805.230230 +INFO: TimeDuration, Event = Pool_end, Time = 0.000622 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352805.230248 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352805.230341 +INFO: TimeDuration, Event = Mul_end, Time = 0.000092 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.230355 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.230382 +INFO: TimeDuration, Event = Add_end, Time = 0.000028 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.230395 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.230415 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352805.230429 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352805.230478 +INFO: TimeDuration, Event = Mul_end, Time = 0.000049 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.230491 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.230511 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352805.230526 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352805.230611 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000085 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.447140, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.270505 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.271415 +INFO: TimeDuration, Event = Add_end, Time = 0.000910 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.271432 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.272259 +INFO: TimeDuration, Event = Relu_end, Time = 0.000828 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.286769 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.287674 +INFO: TimeDuration, Event = Add_end, Time = 0.000905 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.287689 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.288543 +INFO: TimeDuration, Event = Relu_end, Time = 0.000854 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352805.288559 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352805.291492 +INFO: TimeDuration, Event = Pool_end, Time = 0.002933 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.313335 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.313806 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.313822 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.314258 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.330995 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.331471 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.331485 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.331918 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352805.331932 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352805.338621 +INFO: TimeDuration, Event = Pool_end, Time = 0.006689 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.348795 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.349093 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.349108 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.349333 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.358035 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.358339 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.358353 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.358577 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.370652 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.370952 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.370965 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.371192 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352805.371204 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352805.373945 +INFO: TimeDuration, Event = Pool_end, Time = 0.002740 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.380534 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.380870 +INFO: TimeDuration, Event = Add_end, Time = 0.000336 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.380883 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.381006 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.387231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.387562 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.387576 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.387700 +INFO: TimeDuration, Event = Relu_end, Time = 0.000124 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.395659 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.395989 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.396002 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.396125 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352805.396142 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352805.398900 +INFO: TimeDuration, Event = Pool_end, Time = 0.002758 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.401108 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.401419 +INFO: TimeDuration, Event = Add_end, Time = 0.000311 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.401433 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.401474 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.403708 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.403996 +INFO: TimeDuration, Event = Add_end, Time = 0.000288 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.404010 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.404051 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.406340 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.406622 +INFO: TimeDuration, Event = Add_end, Time = 0.000283 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.406637 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.406678 +INFO: TimeDuration, Event = Relu_end, Time = 0.000041 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352805.406693 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352805.407315 +INFO: TimeDuration, Event = Pool_end, Time = 0.000622 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352805.407333 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352805.407424 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.407437 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.407458 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.407470 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.407490 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352805.407521 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352805.407570 +INFO: TimeDuration, Event = Mul_end, Time = 0.000050 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.407584 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.407604 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352805.407619 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352805.407699 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000080 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.974146, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.447709 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.448612 +INFO: TimeDuration, Event = Add_end, Time = 0.000903 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.448628 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.449457 +INFO: TimeDuration, Event = Relu_end, Time = 0.000829 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.463930 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.464834 +INFO: TimeDuration, Event = Add_end, Time = 0.000904 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.464851 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.465704 +INFO: TimeDuration, Event = Relu_end, Time = 0.000853 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352805.465718 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352805.468656 +INFO: TimeDuration, Event = Pool_end, Time = 0.002938 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.490498 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.490968 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.490999 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.491435 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.508118 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.508594 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.508609 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.509045 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352805.509057 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352805.515745 +INFO: TimeDuration, Event = Pool_end, Time = 0.006688 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.525902 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.526199 +INFO: TimeDuration, Event = Add_end, Time = 0.000296 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.526211 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.526436 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.535140 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.535444 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.535457 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.535682 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.547752 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.548054 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.548068 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.548294 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352805.548311 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352805.551044 +INFO: TimeDuration, Event = Pool_end, Time = 0.002733 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.557640 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.557978 +INFO: TimeDuration, Event = Add_end, Time = 0.000338 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.557991 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.558112 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.565586 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.565919 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.565933 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.566058 +INFO: TimeDuration, Event = Relu_end, Time = 0.000124 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.574007 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.574338 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.574351 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.574472 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352805.574492 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352805.577260 +INFO: TimeDuration, Event = Pool_end, Time = 0.002767 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.579465 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.579754 +INFO: TimeDuration, Event = Add_end, Time = 0.000289 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.579769 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.579811 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.582090 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.582380 +INFO: TimeDuration, Event = Add_end, Time = 0.000289 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.582394 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.582436 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.584723 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.585008 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.585022 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.585064 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352805.585076 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352805.585702 +INFO: TimeDuration, Event = Pool_end, Time = 0.000626 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352805.585721 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352805.585816 +INFO: TimeDuration, Event = Mul_end, Time = 0.000095 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.585830 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.585853 +INFO: TimeDuration, Event = Add_end, Time = 0.000023 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.585865 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.585884 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352805.585899 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352805.585951 +INFO: TimeDuration, Event = Mul_end, Time = 0.000052 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.585964 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.585984 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352805.585999 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352805.586134 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000136 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 66.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 141.183094, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.628484 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.629389 +INFO: TimeDuration, Event = Add_end, Time = 0.000905 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.629407 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.630227 +INFO: TimeDuration, Event = Relu_end, Time = 0.000820 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.644716 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.645617 +INFO: TimeDuration, Event = Add_end, Time = 0.000901 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.645633 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.646484 +INFO: TimeDuration, Event = Relu_end, Time = 0.000850 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352805.646498 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352805.649443 +INFO: TimeDuration, Event = Pool_end, Time = 0.002945 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.671288 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.671760 +INFO: TimeDuration, Event = Add_end, Time = 0.000472 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.671776 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.672212 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.688953 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.689428 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.689442 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.689876 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352805.689889 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352805.696585 +INFO: TimeDuration, Event = Pool_end, Time = 0.006696 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.706760 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.707058 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.707072 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.707297 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.715981 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.716285 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.716298 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.716531 +INFO: TimeDuration, Event = Relu_end, Time = 0.000233 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.728631 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.728933 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.728947 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.729172 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352805.729185 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352805.731922 +INFO: TimeDuration, Event = Pool_end, Time = 0.002737 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.738525 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.738862 +INFO: TimeDuration, Event = Add_end, Time = 0.000337 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.738874 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.738996 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.745231 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.745559 +INFO: TimeDuration, Event = Add_end, Time = 0.000328 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.745572 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.745695 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.753355 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.753683 +INFO: TimeDuration, Event = Add_end, Time = 0.000328 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.753696 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.753818 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352805.753838 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352805.756591 +INFO: TimeDuration, Event = Pool_end, Time = 0.002753 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.758810 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.759098 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.759112 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.759154 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.761436 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.761726 +INFO: TimeDuration, Event = Add_end, Time = 0.000289 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.761739 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.761781 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.764066 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.764350 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.764467 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.764510 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352805.764524 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352805.765041 +INFO: TimeDuration, Event = Pool_end, Time = 0.000517 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352805.765058 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352805.765149 +INFO: TimeDuration, Event = Mul_end, Time = 0.000090 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.765163 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.765184 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.765209 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.765229 +INFO: TimeDuration, Event = Relu_end, Time = 0.000021 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352805.765244 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352805.765293 +INFO: TimeDuration, Event = Mul_end, Time = 0.000049 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.765308 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.765328 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352805.765342 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352805.765425 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000084 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.637264, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.805665 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.806571 +INFO: TimeDuration, Event = Add_end, Time = 0.000907 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.806586 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.807414 +INFO: TimeDuration, Event = Relu_end, Time = 0.000828 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.821979 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.822887 +INFO: TimeDuration, Event = Add_end, Time = 0.000908 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.822904 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.823760 +INFO: TimeDuration, Event = Relu_end, Time = 0.000856 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352805.823773 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352805.826704 +INFO: TimeDuration, Event = Pool_end, Time = 0.002931 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.848552 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.849022 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.849038 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.849475 +INFO: TimeDuration, Event = Relu_end, Time = 0.000437 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.866176 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.866651 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.866664 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.867098 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352805.867111 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352805.873801 +INFO: TimeDuration, Event = Pool_end, Time = 0.006690 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.883973 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.884270 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.884284 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.884512 +INFO: TimeDuration, Event = Relu_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.893199 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.893503 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.893516 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.893740 +INFO: TimeDuration, Event = Relu_end, Time = 0.000224 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.905815 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.906116 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.906129 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.906357 +INFO: TimeDuration, Event = Relu_end, Time = 0.000228 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352805.906370 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352805.909111 +INFO: TimeDuration, Event = Pool_end, Time = 0.002741 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.915699 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.916033 +INFO: TimeDuration, Event = Add_end, Time = 0.000334 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.916046 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.916167 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.922098 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.922429 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.922443 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.922566 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.930218 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.930552 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.930565 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.930686 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352805.930704 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352805.933459 +INFO: TimeDuration, Event = Pool_end, Time = 0.002754 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.935663 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.935950 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.935964 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.936006 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.938299 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.938587 +INFO: TimeDuration, Event = Add_end, Time = 0.000288 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.938601 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.938643 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.940924 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.941209 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.941223 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.941265 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352805.941277 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352805.941903 +INFO: TimeDuration, Event = Pool_end, Time = 0.000626 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352805.941921 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352805.942012 +INFO: TimeDuration, Event = Mul_end, Time = 0.000092 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.942027 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.942049 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.942062 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.942081 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352805.942096 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352805.942144 +INFO: TimeDuration, Event = Mul_end, Time = 0.000048 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.942157 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.942176 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352805.942191 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352805.942274 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000082 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.498630, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.982619 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.983526 +INFO: TimeDuration, Event = Add_end, Time = 0.000907 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.983541 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352805.984375 +INFO: TimeDuration, Event = Relu_end, Time = 0.000833 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352805.998976 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352805.999879 +INFO: TimeDuration, Event = Add_end, Time = 0.000903 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352805.999895 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.000752 +INFO: TimeDuration, Event = Relu_end, Time = 0.000857 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352806.000766 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352806.003701 +INFO: TimeDuration, Event = Pool_end, Time = 0.002935 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.025542 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.026011 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.026043 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.026476 +INFO: TimeDuration, Event = Relu_end, Time = 0.000433 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.043229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.043703 +INFO: TimeDuration, Event = Add_end, Time = 0.000474 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.043716 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.044148 +INFO: TimeDuration, Event = Relu_end, Time = 0.000432 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352806.044160 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352806.050856 +INFO: TimeDuration, Event = Pool_end, Time = 0.006696 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.061018 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.061315 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.061325 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.061551 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.070399 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.070703 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.070716 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.070944 +INFO: TimeDuration, Event = Relu_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.083053 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.083354 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.083367 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.083594 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352806.083607 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352806.086348 +INFO: TimeDuration, Event = Pool_end, Time = 0.002741 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.092934 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.093274 +INFO: TimeDuration, Event = Add_end, Time = 0.000340 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.093287 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.093407 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.099320 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.099652 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.099665 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.099788 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.108130 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.108466 +INFO: TimeDuration, Event = Add_end, Time = 0.000336 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.108480 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.108604 +INFO: TimeDuration, Event = Relu_end, Time = 0.000124 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352806.108626 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352806.111368 +INFO: TimeDuration, Event = Pool_end, Time = 0.002742 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.113587 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.113874 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.113886 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.113929 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.116226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.116513 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.116528 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.116571 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.118864 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.119145 +INFO: TimeDuration, Event = Add_end, Time = 0.000281 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.119158 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.119200 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352806.119213 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352806.119837 +INFO: TimeDuration, Event = Pool_end, Time = 0.000624 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352806.119872 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352806.119964 +INFO: TimeDuration, Event = Mul_end, Time = 0.000092 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.119977 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.119999 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.120011 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.120032 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352806.120046 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352806.120100 +INFO: TimeDuration, Event = Mul_end, Time = 0.000054 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.120115 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.120135 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352806.120148 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352806.120247 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000098 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 140.370726, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.160546 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.161451 +INFO: TimeDuration, Event = Add_end, Time = 0.000905 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.161468 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.162293 +INFO: TimeDuration, Event = Relu_end, Time = 0.000826 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.176810 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.177710 +INFO: TimeDuration, Event = Add_end, Time = 0.000900 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.177726 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.178576 +INFO: TimeDuration, Event = Relu_end, Time = 0.000850 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352806.178591 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352806.181535 +INFO: TimeDuration, Event = Pool_end, Time = 0.002944 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.203377 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.203848 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.203864 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.204298 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.221106 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.221583 +INFO: TimeDuration, Event = Add_end, Time = 0.000477 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.221597 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.222034 +INFO: TimeDuration, Event = Relu_end, Time = 0.000437 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352806.222048 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352806.228731 +INFO: TimeDuration, Event = Pool_end, Time = 0.006683 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.238902 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.239201 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.239213 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.239442 +INFO: TimeDuration, Event = Relu_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.248131 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.248436 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.248451 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.248676 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.260748 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.261049 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.261061 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.261293 +INFO: TimeDuration, Event = Relu_end, Time = 0.000232 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352806.261307 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352806.264042 +INFO: TimeDuration, Event = Pool_end, Time = 0.002735 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.270649 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.270984 +INFO: TimeDuration, Event = Add_end, Time = 0.000335 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.270997 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.271118 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.277359 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.277693 +INFO: TimeDuration, Event = Add_end, Time = 0.000334 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.277705 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.277833 +INFO: TimeDuration, Event = Relu_end, Time = 0.000128 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.285783 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.286110 +INFO: TimeDuration, Event = Add_end, Time = 0.000327 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.286124 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.286245 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352806.286263 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352806.289021 +INFO: TimeDuration, Event = Pool_end, Time = 0.002759 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.291240 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.291524 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.291551 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.291594 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.293910 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.294198 +INFO: TimeDuration, Event = Add_end, Time = 0.000288 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.294210 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.294252 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.296536 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.296818 +INFO: TimeDuration, Event = Add_end, Time = 0.000282 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.296831 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.296872 +INFO: TimeDuration, Event = Relu_end, Time = 0.000041 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352806.296885 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352806.297510 +INFO: TimeDuration, Event = Pool_end, Time = 0.000626 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352806.297529 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352806.297620 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.297634 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.297655 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.297668 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.297687 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352806.297702 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352806.297750 +INFO: TimeDuration, Event = Mul_end, Time = 0.000048 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.297780 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.297804 +INFO: TimeDuration, Event = Add_end, Time = 0.000024 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352806.297818 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352806.297903 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000085 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 66.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 140.016582, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.338076 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.338982 +INFO: TimeDuration, Event = Add_end, Time = 0.000906 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.338998 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.339824 +INFO: TimeDuration, Event = Relu_end, Time = 0.000826 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.354384 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.355292 +INFO: TimeDuration, Event = Add_end, Time = 0.000908 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.355307 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.356163 +INFO: TimeDuration, Event = Relu_end, Time = 0.000855 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352806.356177 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352806.359111 +INFO: TimeDuration, Event = Pool_end, Time = 0.002934 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.380963 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.381434 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.381452 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.381887 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.398594 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.399069 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.399083 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.399517 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352806.399543 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352806.406222 +INFO: TimeDuration, Event = Pool_end, Time = 0.006679 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.416396 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.416694 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.416708 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.416934 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.425660 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.425963 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.425976 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.426201 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.438268 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.438570 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.438582 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.438809 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352806.438822 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352806.441561 +INFO: TimeDuration, Event = Pool_end, Time = 0.002739 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.448156 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.448491 +INFO: TimeDuration, Event = Add_end, Time = 0.000336 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.448506 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.448629 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.454534 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.454865 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.454879 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.455002 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.462654 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.462984 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.462998 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.463119 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352806.463136 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352806.465895 +INFO: TimeDuration, Event = Pool_end, Time = 0.002759 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.468103 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.468390 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.468401 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.468443 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.470719 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.471009 +INFO: TimeDuration, Event = Add_end, Time = 0.000290 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.471022 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.471064 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.473336 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.473620 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.473634 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.473675 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352806.473687 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352806.474311 +INFO: TimeDuration, Event = Pool_end, Time = 0.000624 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352806.474329 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352806.474420 +INFO: TimeDuration, Event = Mul_end, Time = 0.000090 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.474433 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.474458 +INFO: TimeDuration, Event = Add_end, Time = 0.000024 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.474470 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.474490 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352806.474504 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352806.474551 +INFO: TimeDuration, Event = Mul_end, Time = 0.000047 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.474566 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.474586 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352806.474599 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352806.474682 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000083 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.376331, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.515105 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.516007 +INFO: TimeDuration, Event = Add_end, Time = 0.000903 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.516024 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.516852 +INFO: TimeDuration, Event = Relu_end, Time = 0.000828 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.531370 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.532276 +INFO: TimeDuration, Event = Add_end, Time = 0.000906 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.532293 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.533151 +INFO: TimeDuration, Event = Relu_end, Time = 0.000859 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352806.533166 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352806.536097 +INFO: TimeDuration, Event = Pool_end, Time = 0.002931 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.557946 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.558415 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.558430 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.558863 +INFO: TimeDuration, Event = Relu_end, Time = 0.000433 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.575573 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.576047 +INFO: TimeDuration, Event = Add_end, Time = 0.000474 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.576060 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.576493 +INFO: TimeDuration, Event = Relu_end, Time = 0.000433 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352806.576505 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352806.583200 +INFO: TimeDuration, Event = Pool_end, Time = 0.006694 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.593369 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.593684 +INFO: TimeDuration, Event = Add_end, Time = 0.000315 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.593698 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.593923 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.602595 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.602898 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.602912 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.603139 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.615208 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.615510 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.615523 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.615749 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352806.615763 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352806.618501 +INFO: TimeDuration, Event = Pool_end, Time = 0.002739 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.625091 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.625430 +INFO: TimeDuration, Event = Add_end, Time = 0.000338 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.625444 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.625566 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.631794 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.632128 +INFO: TimeDuration, Event = Add_end, Time = 0.000335 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.632142 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.632266 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.639918 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.640247 +INFO: TimeDuration, Event = Add_end, Time = 0.000328 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.640260 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.640382 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352806.640397 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352806.643159 +INFO: TimeDuration, Event = Pool_end, Time = 0.002762 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.645366 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.645651 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.645665 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.645707 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.647990 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.648278 +INFO: TimeDuration, Event = Add_end, Time = 0.000288 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.648291 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.648333 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.650622 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.650915 +INFO: TimeDuration, Event = Add_end, Time = 0.000293 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.650929 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.650971 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352806.650985 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352806.651599 +INFO: TimeDuration, Event = Pool_end, Time = 0.000614 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352806.651616 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352806.651706 +INFO: TimeDuration, Event = Mul_end, Time = 0.000090 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.651719 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.651740 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.651772 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.651794 +INFO: TimeDuration, Event = Relu_end, Time = 0.000021 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352806.651808 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352806.651856 +INFO: TimeDuration, Event = Mul_end, Time = 0.000049 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.651871 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.651891 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352806.651915 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352806.651999 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000084 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 68.199997 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.762873, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.692123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.693026 +INFO: TimeDuration, Event = Add_end, Time = 0.000902 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.693041 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.693866 +INFO: TimeDuration, Event = Relu_end, Time = 0.000825 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.708349 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.709248 +INFO: TimeDuration, Event = Add_end, Time = 0.000899 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.709266 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.710119 +INFO: TimeDuration, Event = Relu_end, Time = 0.000852 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352806.710133 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352806.713075 +INFO: TimeDuration, Event = Pool_end, Time = 0.002942 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.734915 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.735384 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.735399 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.735835 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.752591 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.753066 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.753079 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.753513 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352806.753526 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352806.760215 +INFO: TimeDuration, Event = Pool_end, Time = 0.006689 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.770400 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.770698 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.770711 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.770936 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.779617 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.779921 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.779934 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.780160 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.792228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.792528 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.792542 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.792768 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352806.792781 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352806.795522 +INFO: TimeDuration, Event = Pool_end, Time = 0.002742 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.802118 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.802455 +INFO: TimeDuration, Event = Add_end, Time = 0.000337 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.802468 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.802589 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.808818 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.809148 +INFO: TimeDuration, Event = Add_end, Time = 0.000329 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.809160 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.809283 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.817249 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.817579 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.817592 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.817713 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352806.817745 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352806.820491 +INFO: TimeDuration, Event = Pool_end, Time = 0.002746 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.822754 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.823042 +INFO: TimeDuration, Event = Add_end, Time = 0.000288 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.823056 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.823098 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.825369 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.825657 +INFO: TimeDuration, Event = Add_end, Time = 0.000288 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.825670 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.825712 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.827999 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.828286 +INFO: TimeDuration, Event = Add_end, Time = 0.000288 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.828300 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.828350 +INFO: TimeDuration, Event = Relu_end, Time = 0.000049 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352806.828360 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352806.828976 +INFO: TimeDuration, Event = Pool_end, Time = 0.000616 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352806.828995 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352806.829084 +INFO: TimeDuration, Event = Mul_end, Time = 0.000090 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.829100 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.829121 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.829133 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.829152 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352806.829168 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352806.829216 +INFO: TimeDuration, Event = Mul_end, Time = 0.000048 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.829229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.829248 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352806.829261 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352806.829343 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000082 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.970732, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.869462 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.870357 +INFO: TimeDuration, Event = Add_end, Time = 0.000896 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.870372 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.871203 +INFO: TimeDuration, Event = Relu_end, Time = 0.000830 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.885689 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.886590 +INFO: TimeDuration, Event = Add_end, Time = 0.000901 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.886609 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.887458 +INFO: TimeDuration, Event = Relu_end, Time = 0.000849 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352806.887472 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352806.890416 +INFO: TimeDuration, Event = Pool_end, Time = 0.002944 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.912254 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.912724 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.912741 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.913176 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.929916 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.930391 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.930405 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.930843 +INFO: TimeDuration, Event = Relu_end, Time = 0.000438 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352806.930856 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352806.937543 +INFO: TimeDuration, Event = Pool_end, Time = 0.006687 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.947700 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.947997 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.948009 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.948236 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.956926 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.957230 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.957242 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.957468 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.969550 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.969852 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.969864 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.970090 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352806.970104 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352806.972840 +INFO: TimeDuration, Event = Pool_end, Time = 0.002736 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.979489 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.979825 +INFO: TimeDuration, Event = Add_end, Time = 0.000336 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.979849 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.979972 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.985898 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.986228 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.986242 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.986364 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.994335 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352806.994665 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352806.994678 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352806.994802 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352806.994830 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352806.997580 +INFO: TimeDuration, Event = Pool_end, Time = 0.002750 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352806.999787 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.000075 +INFO: TimeDuration, Event = Add_end, Time = 0.000288 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.000089 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.000131 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.002418 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.002706 +INFO: TimeDuration, Event = Add_end, Time = 0.000288 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.002720 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.002763 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.005034 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.005331 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.005358 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.005400 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352807.005412 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352807.006010 +INFO: TimeDuration, Event = Pool_end, Time = 0.000598 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352807.006028 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352807.006118 +INFO: TimeDuration, Event = Mul_end, Time = 0.000090 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.006132 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.006153 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.006177 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.006202 +INFO: TimeDuration, Event = Relu_end, Time = 0.000024 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352807.006218 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352807.006266 +INFO: TimeDuration, Event = Mul_end, Time = 0.000049 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.006280 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.006299 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352807.006341 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352807.006428 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000087 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.692560, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.046567 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.047473 +INFO: TimeDuration, Event = Add_end, Time = 0.000906 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.047489 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.048316 +INFO: TimeDuration, Event = Relu_end, Time = 0.000828 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.063208 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.064112 +INFO: TimeDuration, Event = Add_end, Time = 0.000904 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.064128 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.064980 +INFO: TimeDuration, Event = Relu_end, Time = 0.000852 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352807.064995 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352807.067935 +INFO: TimeDuration, Event = Pool_end, Time = 0.002940 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.089775 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.090245 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.090263 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.090697 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.107420 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.107895 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.107910 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.108344 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352807.108355 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352807.115046 +INFO: TimeDuration, Event = Pool_end, Time = 0.006691 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.125213 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.125510 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.125546 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.125772 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.134589 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.134894 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.134908 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.135134 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.147204 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.147506 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.147520 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.147745 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352807.147759 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352807.150496 +INFO: TimeDuration, Event = Pool_end, Time = 0.002737 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.157085 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.157418 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.157432 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.157554 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.163776 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.164109 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.164122 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.164245 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.171985 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.172579 +INFO: TimeDuration, Event = Add_end, Time = 0.000594 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.172606 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.172739 +INFO: TimeDuration, Event = Relu_end, Time = 0.000133 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352807.172756 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352807.175228 +INFO: TimeDuration, Event = Pool_end, Time = 0.002472 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.177426 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.177716 +INFO: TimeDuration, Event = Add_end, Time = 0.000289 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.177730 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.177773 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.180038 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.180325 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.180434 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.180478 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.182654 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.182937 +INFO: TimeDuration, Event = Add_end, Time = 0.000283 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.182951 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.182993 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352807.183007 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352807.183629 +INFO: TimeDuration, Event = Pool_end, Time = 0.000622 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352807.183649 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352807.183743 +INFO: TimeDuration, Event = Mul_end, Time = 0.000094 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.183770 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.183795 +INFO: TimeDuration, Event = Add_end, Time = 0.000026 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.183807 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.183827 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352807.183841 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352807.183896 +INFO: TimeDuration, Event = Mul_end, Time = 0.000054 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.183908 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.183928 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352807.183943 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352807.184028 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000085 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.964245, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.224880 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.225783 +INFO: TimeDuration, Event = Add_end, Time = 0.000903 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.225798 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.226620 +INFO: TimeDuration, Event = Relu_end, Time = 0.000822 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.241129 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.242028 +INFO: TimeDuration, Event = Add_end, Time = 0.000900 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.242046 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.242897 +INFO: TimeDuration, Event = Relu_end, Time = 0.000851 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352807.242910 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352807.245851 +INFO: TimeDuration, Event = Pool_end, Time = 0.002940 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.268745 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.269215 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.269231 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.269665 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.285360 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.285836 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.285849 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.286284 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352807.286298 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352807.292991 +INFO: TimeDuration, Event = Pool_end, Time = 0.006693 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.303178 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.303477 +INFO: TimeDuration, Event = Add_end, Time = 0.000299 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.303505 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.303732 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.312788 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.313093 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.313106 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.313332 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.325403 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.325704 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.325717 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.325943 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352807.325956 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352807.329754 +INFO: TimeDuration, Event = Pool_end, Time = 0.003797 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.335377 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.335714 +INFO: TimeDuration, Event = Add_end, Time = 0.000337 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.335728 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.335850 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.341767 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.342100 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.342113 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.342234 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.349889 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.350222 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.350235 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.350356 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352807.350374 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352807.353128 +INFO: TimeDuration, Event = Pool_end, Time = 0.002755 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.355321 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.355608 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.355623 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.355665 +INFO: TimeDuration, Event = Relu_end, Time = 0.000041 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.357949 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.358236 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.358249 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.358290 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.360582 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.360870 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.360883 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.360925 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352807.360937 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352807.361570 +INFO: TimeDuration, Event = Pool_end, Time = 0.000633 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352807.361588 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352807.361678 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.361693 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.361714 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.361727 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.361747 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352807.361776 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352807.361825 +INFO: TimeDuration, Event = Mul_end, Time = 0.000049 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.361839 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.361859 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352807.361873 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352807.361956 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000082 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 66.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.884572, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.401926 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.402827 +INFO: TimeDuration, Event = Add_end, Time = 0.000901 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.402842 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.403670 +INFO: TimeDuration, Event = Relu_end, Time = 0.000828 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.418171 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.419075 +INFO: TimeDuration, Event = Add_end, Time = 0.000904 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.419090 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.419943 +INFO: TimeDuration, Event = Relu_end, Time = 0.000853 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352807.419956 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352807.422902 +INFO: TimeDuration, Event = Pool_end, Time = 0.002946 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.444738 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.445207 +INFO: TimeDuration, Event = Add_end, Time = 0.000468 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.445222 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.445661 +INFO: TimeDuration, Event = Relu_end, Time = 0.000439 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.462394 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.462870 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.462882 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.463319 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352807.463331 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352807.470020 +INFO: TimeDuration, Event = Pool_end, Time = 0.006688 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.480184 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.480483 +INFO: TimeDuration, Event = Add_end, Time = 0.000299 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.480497 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.480725 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.489412 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.489716 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.489729 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.489955 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.502027 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.502329 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.502342 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.502569 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352807.502583 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352807.505321 +INFO: TimeDuration, Event = Pool_end, Time = 0.002738 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.511900 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.512238 +INFO: TimeDuration, Event = Add_end, Time = 0.000338 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.512250 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.512372 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.518303 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.518636 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.518649 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.518770 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.526486 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.526819 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.526833 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.526955 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352807.526973 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352807.529727 +INFO: TimeDuration, Event = Pool_end, Time = 0.002755 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.531936 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.532226 +INFO: TimeDuration, Event = Add_end, Time = 0.000290 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.532239 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.532281 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.534555 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.534840 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.534853 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.534894 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.537193 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.537475 +INFO: TimeDuration, Event = Add_end, Time = 0.000283 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.537490 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.537532 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352807.537544 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352807.538167 +INFO: TimeDuration, Event = Pool_end, Time = 0.000623 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352807.538185 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352807.538277 +INFO: TimeDuration, Event = Mul_end, Time = 0.000092 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.538291 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.538311 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.538323 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.538343 +INFO: TimeDuration, Event = Relu_end, Time = 0.000019 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352807.538356 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352807.538404 +INFO: TimeDuration, Event = Mul_end, Time = 0.000047 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.538430 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.538450 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352807.538464 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352807.538548 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000084 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.308292, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.578835 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.579744 +INFO: TimeDuration, Event = Add_end, Time = 0.000909 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.579761 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.580588 +INFO: TimeDuration, Event = Relu_end, Time = 0.000827 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.595072 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.595978 +INFO: TimeDuration, Event = Add_end, Time = 0.000906 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.595995 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.596849 +INFO: TimeDuration, Event = Relu_end, Time = 0.000854 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352807.596863 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352807.599796 +INFO: TimeDuration, Event = Pool_end, Time = 0.002933 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.621638 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.622109 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.622126 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.622564 +INFO: TimeDuration, Event = Relu_end, Time = 0.000438 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.639290 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.639766 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.639779 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.640215 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352807.640227 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352807.646918 +INFO: TimeDuration, Event = Pool_end, Time = 0.006692 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.657091 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.657388 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.657401 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.657628 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.666317 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.666622 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.666646 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.666872 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.678931 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.679237 +INFO: TimeDuration, Event = Add_end, Time = 0.000306 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.679269 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.679494 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352807.679507 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352807.682227 +INFO: TimeDuration, Event = Pool_end, Time = 0.002720 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.688807 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.689142 +INFO: TimeDuration, Event = Add_end, Time = 0.000335 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.689155 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.689276 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.695499 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.695828 +INFO: TimeDuration, Event = Add_end, Time = 0.000329 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.695844 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.695966 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.703624 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.703955 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.703970 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.704091 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352807.704109 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352807.706864 +INFO: TimeDuration, Event = Pool_end, Time = 0.002755 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.711116 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.711407 +INFO: TimeDuration, Event = Add_end, Time = 0.000291 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.711420 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.711463 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.713790 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.714082 +INFO: TimeDuration, Event = Add_end, Time = 0.000292 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.714097 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.714140 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.716489 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.716778 +INFO: TimeDuration, Event = Add_end, Time = 0.000288 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.716792 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.716834 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352807.716847 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352807.717393 +INFO: TimeDuration, Event = Pool_end, Time = 0.000546 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352807.717412 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352807.717508 +INFO: TimeDuration, Event = Mul_end, Time = 0.000096 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.717522 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.717543 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.717556 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.717575 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352807.717590 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352807.717662 +INFO: TimeDuration, Event = Mul_end, Time = 0.000072 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.717677 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.717697 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352807.717710 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352807.717794 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000084 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 141.704862, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.758086 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.758986 +INFO: TimeDuration, Event = Add_end, Time = 0.000899 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.759004 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.759830 +INFO: TimeDuration, Event = Relu_end, Time = 0.000826 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.774325 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.775226 +INFO: TimeDuration, Event = Add_end, Time = 0.000901 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.775241 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.776090 +INFO: TimeDuration, Event = Relu_end, Time = 0.000848 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352807.776104 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352807.779049 +INFO: TimeDuration, Event = Pool_end, Time = 0.002945 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.800892 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.801363 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.801379 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.801814 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.818576 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.819051 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.819066 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.819500 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352807.819513 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352807.826190 +INFO: TimeDuration, Event = Pool_end, Time = 0.006676 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.836365 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.836662 +INFO: TimeDuration, Event = Add_end, Time = 0.000296 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.836676 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.836902 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.845641 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.845949 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.845962 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.846188 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.858251 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.858553 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.858566 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.858794 +INFO: TimeDuration, Event = Relu_end, Time = 0.000228 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352807.858807 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352807.861544 +INFO: TimeDuration, Event = Pool_end, Time = 0.002738 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.868148 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.868486 +INFO: TimeDuration, Event = Add_end, Time = 0.000338 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.868501 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.868624 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.874526 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.874859 +INFO: TimeDuration, Event = Add_end, Time = 0.000334 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.874873 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.874996 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.882997 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.883326 +INFO: TimeDuration, Event = Add_end, Time = 0.000329 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.883339 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.883462 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352807.883480 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352807.886219 +INFO: TimeDuration, Event = Pool_end, Time = 0.002738 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.888440 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.888729 +INFO: TimeDuration, Event = Add_end, Time = 0.000289 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.888743 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.888785 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.891043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.891331 +INFO: TimeDuration, Event = Add_end, Time = 0.000288 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.891358 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.891401 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.893676 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.893960 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.893974 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.894017 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352807.894031 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352807.894652 +INFO: TimeDuration, Event = Pool_end, Time = 0.000621 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352807.894671 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352807.894762 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.894775 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.894797 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.894809 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.894829 +INFO: TimeDuration, Event = Relu_end, Time = 0.000019 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352807.894842 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352807.894890 +INFO: TimeDuration, Event = Mul_end, Time = 0.000048 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.894903 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.894922 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352807.894936 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352807.895018 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000082 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.667106, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.935116 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.936011 +INFO: TimeDuration, Event = Add_end, Time = 0.000895 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.936026 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.936857 +INFO: TimeDuration, Event = Relu_end, Time = 0.000830 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.951328 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.952232 +INFO: TimeDuration, Event = Add_end, Time = 0.000904 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.952249 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.953103 +INFO: TimeDuration, Event = Relu_end, Time = 0.000854 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352807.953119 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352807.956054 +INFO: TimeDuration, Event = Pool_end, Time = 0.002936 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.980135 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.980622 +INFO: TimeDuration, Event = Add_end, Time = 0.000487 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.980641 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.981083 +INFO: TimeDuration, Event = Relu_end, Time = 0.000442 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352807.995986 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352807.996459 +INFO: TimeDuration, Event = Add_end, Time = 0.000473 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352807.996474 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352807.996912 +INFO: TimeDuration, Event = Relu_end, Time = 0.000438 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352807.996926 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352808.003614 +INFO: TimeDuration, Event = Pool_end, Time = 0.006688 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.013783 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.014084 +INFO: TimeDuration, Event = Add_end, Time = 0.000300 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.014098 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.014326 +INFO: TimeDuration, Event = Relu_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.023009 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.023315 +INFO: TimeDuration, Event = Add_end, Time = 0.000306 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.023333 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.023561 +INFO: TimeDuration, Event = Relu_end, Time = 0.000229 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.035615 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.035915 +INFO: TimeDuration, Event = Add_end, Time = 0.000300 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.035928 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.036155 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352808.036168 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352808.038909 +INFO: TimeDuration, Event = Pool_end, Time = 0.002741 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.045503 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.045830 +INFO: TimeDuration, Event = Add_end, Time = 0.000326 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.045844 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.045966 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.052204 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.052536 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.052550 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.052673 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.060643 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.060974 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.060988 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.061110 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352808.061130 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352808.063884 +INFO: TimeDuration, Event = Pool_end, Time = 0.002754 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.066105 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.066393 +INFO: TimeDuration, Event = Add_end, Time = 0.000289 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.066406 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.066448 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.068734 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.069023 +INFO: TimeDuration, Event = Add_end, Time = 0.000289 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.069048 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.069090 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.071365 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.071650 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.071664 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.071706 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352808.071719 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352808.072382 +INFO: TimeDuration, Event = Pool_end, Time = 0.000664 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352808.072440 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352808.072542 +INFO: TimeDuration, Event = Mul_end, Time = 0.000102 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.072556 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.072577 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.072590 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.072610 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352808.072624 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352808.072678 +INFO: TimeDuration, Event = Mul_end, Time = 0.000054 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.072700 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.072720 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352808.072735 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352808.072817 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000082 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 66.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 140.496158, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.112902 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.113806 +INFO: TimeDuration, Event = Add_end, Time = 0.000904 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.113820 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.114645 +INFO: TimeDuration, Event = Relu_end, Time = 0.000824 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.129192 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.130094 +INFO: TimeDuration, Event = Add_end, Time = 0.000903 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.130110 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.130964 +INFO: TimeDuration, Event = Relu_end, Time = 0.000854 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352808.130978 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352808.133917 +INFO: TimeDuration, Event = Pool_end, Time = 0.002939 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.155756 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.156226 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.156242 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.156680 +INFO: TimeDuration, Event = Relu_end, Time = 0.000439 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.173432 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.173909 +INFO: TimeDuration, Event = Add_end, Time = 0.000477 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.173922 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.174358 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352808.174370 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352808.181059 +INFO: TimeDuration, Event = Pool_end, Time = 0.006689 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.191246 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.191543 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.191556 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.191783 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.200460 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.200765 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.200778 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.201003 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.213068 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.213369 +INFO: TimeDuration, Event = Add_end, Time = 0.000300 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.213382 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.213608 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352808.213622 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352808.216363 +INFO: TimeDuration, Event = Pool_end, Time = 0.002742 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.222981 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.223317 +INFO: TimeDuration, Event = Add_end, Time = 0.000335 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.223330 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.223452 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.229671 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.230000 +INFO: TimeDuration, Event = Add_end, Time = 0.000329 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.230013 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.230136 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.238118 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.238451 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.238464 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.238586 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352808.238604 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352808.241344 +INFO: TimeDuration, Event = Pool_end, Time = 0.002741 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.243562 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.243850 +INFO: TimeDuration, Event = Add_end, Time = 0.000288 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.243865 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.243906 +INFO: TimeDuration, Event = Relu_end, Time = 0.000041 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.246163 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.246450 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.246462 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.246504 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.248765 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.249049 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.249062 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.249104 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352808.249116 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352808.249740 +INFO: TimeDuration, Event = Pool_end, Time = 0.000624 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352808.249758 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352808.249848 +INFO: TimeDuration, Event = Mul_end, Time = 0.000090 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.249863 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.249883 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.249897 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.249916 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352808.249930 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352808.249979 +INFO: TimeDuration, Event = Mul_end, Time = 0.000049 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.249993 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.250012 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352808.250027 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352808.250110 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000083 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.939692, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.290290 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.291190 +INFO: TimeDuration, Event = Add_end, Time = 0.000900 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.291207 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.292035 +INFO: TimeDuration, Event = Relu_end, Time = 0.000828 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.306539 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.307444 +INFO: TimeDuration, Event = Add_end, Time = 0.000905 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.307461 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.308314 +INFO: TimeDuration, Event = Relu_end, Time = 0.000854 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352808.308326 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352808.312350 +INFO: TimeDuration, Event = Pool_end, Time = 0.004024 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.334160 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.334630 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.334646 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.335085 +INFO: TimeDuration, Event = Relu_end, Time = 0.000439 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.350780 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.351256 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.351281 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.351719 +INFO: TimeDuration, Event = Relu_end, Time = 0.000438 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352808.351732 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352808.358408 +INFO: TimeDuration, Event = Pool_end, Time = 0.006677 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.368653 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.368952 +INFO: TimeDuration, Event = Add_end, Time = 0.000299 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.368964 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.369192 +INFO: TimeDuration, Event = Relu_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.377871 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.378175 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.378188 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.378412 +INFO: TimeDuration, Event = Relu_end, Time = 0.000224 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.390486 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.390789 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.390802 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.391027 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352808.391041 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352808.393780 +INFO: TimeDuration, Event = Pool_end, Time = 0.002740 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.400382 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.400718 +INFO: TimeDuration, Event = Add_end, Time = 0.000336 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.400733 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.400853 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.406768 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.407100 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.407114 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.407236 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.415197 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.415526 +INFO: TimeDuration, Event = Add_end, Time = 0.000329 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.415538 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.415660 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352808.415678 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352808.418438 +INFO: TimeDuration, Event = Pool_end, Time = 0.002760 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.420645 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.420929 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.420943 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.420985 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.423262 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.423550 +INFO: TimeDuration, Event = Add_end, Time = 0.000288 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.423563 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.423604 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.425897 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.426184 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.426198 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.426241 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352808.426254 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352808.426872 +INFO: TimeDuration, Event = Pool_end, Time = 0.000617 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352808.426891 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352808.426980 +INFO: TimeDuration, Event = Mul_end, Time = 0.000090 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.426994 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.427016 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.427028 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.427047 +INFO: TimeDuration, Event = Relu_end, Time = 0.000019 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352808.427086 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352808.427136 +INFO: TimeDuration, Event = Mul_end, Time = 0.000050 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.427150 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.427170 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352808.427185 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352808.427267 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000082 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 68.199997 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.740460, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.467312 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.468216 +INFO: TimeDuration, Event = Add_end, Time = 0.000904 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.468230 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.469057 +INFO: TimeDuration, Event = Relu_end, Time = 0.000827 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.483531 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.484438 +INFO: TimeDuration, Event = Add_end, Time = 0.000906 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.484455 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.485311 +INFO: TimeDuration, Event = Relu_end, Time = 0.000855 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352808.485324 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352808.488257 +INFO: TimeDuration, Event = Pool_end, Time = 0.002933 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.511147 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.511616 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.511632 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.512064 +INFO: TimeDuration, Event = Relu_end, Time = 0.000431 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.529546 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.530028 +INFO: TimeDuration, Event = Add_end, Time = 0.000481 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.530041 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.530474 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352808.530487 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352808.537173 +INFO: TimeDuration, Event = Pool_end, Time = 0.006685 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.547451 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.547749 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.547762 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.547988 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.556826 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.557131 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.557156 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.557383 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.569448 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.569750 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.569762 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.569988 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352808.570000 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352808.572744 +INFO: TimeDuration, Event = Pool_end, Time = 0.002744 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.579331 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.579663 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.579676 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.579799 +INFO: TimeDuration, Event = Relu_end, Time = 0.000124 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.585718 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.586053 +INFO: TimeDuration, Event = Add_end, Time = 0.000335 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.586067 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.586190 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.593837 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.594168 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.594181 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.594304 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352808.594321 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352808.597079 +INFO: TimeDuration, Event = Pool_end, Time = 0.002757 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.599295 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.599581 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.599593 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.599634 +INFO: TimeDuration, Event = Relu_end, Time = 0.000041 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.601923 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.602210 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.602223 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.602266 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.604562 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.604843 +INFO: TimeDuration, Event = Add_end, Time = 0.000281 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.604855 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.604898 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352808.604911 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352808.605535 +INFO: TimeDuration, Event = Pool_end, Time = 0.000624 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352808.605567 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352808.605660 +INFO: TimeDuration, Event = Mul_end, Time = 0.000092 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.605675 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.605697 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.605708 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.605728 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352808.605741 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352808.605789 +INFO: TimeDuration, Event = Mul_end, Time = 0.000048 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.605803 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.605823 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352808.605837 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352808.605919 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000081 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 141.443818, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.646085 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.646995 +INFO: TimeDuration, Event = Add_end, Time = 0.000911 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.647010 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.647835 +INFO: TimeDuration, Event = Relu_end, Time = 0.000825 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.662366 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.663274 +INFO: TimeDuration, Event = Add_end, Time = 0.000907 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.663289 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.664141 +INFO: TimeDuration, Event = Relu_end, Time = 0.000852 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352808.664154 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352808.667090 +INFO: TimeDuration, Event = Pool_end, Time = 0.002936 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.688932 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.689403 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.689419 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.689852 +INFO: TimeDuration, Event = Relu_end, Time = 0.000433 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.706583 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.707077 +INFO: TimeDuration, Event = Add_end, Time = 0.000494 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.707092 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.707529 +INFO: TimeDuration, Event = Relu_end, Time = 0.000437 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352808.707542 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352808.714218 +INFO: TimeDuration, Event = Pool_end, Time = 0.006676 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.724382 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.724680 +INFO: TimeDuration, Event = Add_end, Time = 0.000299 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.724693 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.724920 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.733619 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.733924 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.733938 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.734163 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.746239 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.746539 +INFO: TimeDuration, Event = Add_end, Time = 0.000300 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.746553 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.746786 +INFO: TimeDuration, Event = Relu_end, Time = 0.000233 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352808.746800 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352808.749531 +INFO: TimeDuration, Event = Pool_end, Time = 0.002731 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.756124 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.756462 +INFO: TimeDuration, Event = Add_end, Time = 0.000338 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.756475 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.756598 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.762814 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.763148 +INFO: TimeDuration, Event = Add_end, Time = 0.000334 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.763162 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.763286 +INFO: TimeDuration, Event = Relu_end, Time = 0.000125 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.771250 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.771579 +INFO: TimeDuration, Event = Add_end, Time = 0.000329 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.771591 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.771713 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352808.771732 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352808.776736 +INFO: TimeDuration, Event = Pool_end, Time = 0.005003 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.778593 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.778886 +INFO: TimeDuration, Event = Add_end, Time = 0.000294 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.778899 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.778941 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.781235 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.781518 +INFO: TimeDuration, Event = Add_end, Time = 0.000283 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.781530 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.781572 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.783855 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.784139 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.784152 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.784194 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352808.784207 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352808.784829 +INFO: TimeDuration, Event = Pool_end, Time = 0.000622 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352808.784848 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352808.784940 +INFO: TimeDuration, Event = Mul_end, Time = 0.000092 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.784956 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.784983 +INFO: TimeDuration, Event = Add_end, Time = 0.000028 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.784995 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.785015 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352808.785030 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352808.785078 +INFO: TimeDuration, Event = Mul_end, Time = 0.000048 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.785091 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.785110 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352808.785125 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352808.785211 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000085 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 141.903633, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.825109 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.826015 +INFO: TimeDuration, Event = Add_end, Time = 0.000907 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.826032 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.826855 +INFO: TimeDuration, Event = Relu_end, Time = 0.000823 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.841349 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.842253 +INFO: TimeDuration, Event = Add_end, Time = 0.000905 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.842269 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.843123 +INFO: TimeDuration, Event = Relu_end, Time = 0.000854 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352808.843138 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352808.846072 +INFO: TimeDuration, Event = Pool_end, Time = 0.002934 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.867913 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.868385 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.868399 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.868834 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.885579 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.886055 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.886068 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.886501 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352808.886514 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352808.893205 +INFO: TimeDuration, Event = Pool_end, Time = 0.006691 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.903363 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.903661 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.903673 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.903900 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.912598 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.912902 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.912915 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.913142 +INFO: TimeDuration, Event = Relu_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.925214 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.925517 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.925530 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.925756 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352808.925770 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352808.928505 +INFO: TimeDuration, Event = Pool_end, Time = 0.002736 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.935099 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.935432 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.935445 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.935566 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.941808 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.942163 +INFO: TimeDuration, Event = Add_end, Time = 0.000355 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.942177 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.942300 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.950240 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.950572 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.950585 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.950707 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352808.950725 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352808.953481 +INFO: TimeDuration, Event = Pool_end, Time = 0.002756 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.955703 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.955990 +INFO: TimeDuration, Event = Add_end, Time = 0.000286 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.956004 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.956046 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.958322 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.958610 +INFO: TimeDuration, Event = Add_end, Time = 0.000288 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.958623 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.958664 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.960947 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.961236 +INFO: TimeDuration, Event = Add_end, Time = 0.000289 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.961248 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.961291 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352808.961321 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352808.961924 +INFO: TimeDuration, Event = Pool_end, Time = 0.000602 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352808.961945 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352808.962053 +INFO: TimeDuration, Event = Mul_end, Time = 0.000108 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.962067 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.962088 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352808.962101 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352808.962121 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352808.962135 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352808.962190 +INFO: TimeDuration, Event = Mul_end, Time = 0.000054 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352808.962204 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352808.962224 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352808.962255 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352808.962338 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000083 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.998873, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.002643 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.003544 +INFO: TimeDuration, Event = Add_end, Time = 0.000901 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.003559 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.004386 +INFO: TimeDuration, Event = Relu_end, Time = 0.000827 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.018940 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.019845 +INFO: TimeDuration, Event = Add_end, Time = 0.000905 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.019861 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.020712 +INFO: TimeDuration, Event = Relu_end, Time = 0.000851 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352809.020726 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352809.023667 +INFO: TimeDuration, Event = Pool_end, Time = 0.002941 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.045509 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.045978 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.045994 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.046423 +INFO: TimeDuration, Event = Relu_end, Time = 0.000430 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.063173 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.063654 +INFO: TimeDuration, Event = Add_end, Time = 0.000481 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.063668 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.064103 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352809.064116 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352809.070800 +INFO: TimeDuration, Event = Pool_end, Time = 0.006684 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.080971 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.081269 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.081282 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.081509 +INFO: TimeDuration, Event = Relu_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.090200 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.090505 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.090518 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.090744 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.102814 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.103116 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.103129 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.103355 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352809.103369 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352809.106110 +INFO: TimeDuration, Event = Pool_end, Time = 0.002741 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.112695 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.113033 +INFO: TimeDuration, Event = Add_end, Time = 0.000337 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.113045 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.113167 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.119081 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.119413 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.119427 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.119549 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.127205 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.127535 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.127548 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.127670 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352809.127688 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352809.130446 +INFO: TimeDuration, Event = Pool_end, Time = 0.002757 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.132636 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.132919 +INFO: TimeDuration, Event = Add_end, Time = 0.000283 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.132944 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.132987 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.135262 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.135551 +INFO: TimeDuration, Event = Add_end, Time = 0.000290 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.135564 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.135606 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.137940 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.138224 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.138237 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.138280 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352809.138292 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352809.138916 +INFO: TimeDuration, Event = Pool_end, Time = 0.000624 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352809.138934 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352809.139024 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.139066 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.139088 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.139102 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.139122 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352809.139137 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352809.139186 +INFO: TimeDuration, Event = Mul_end, Time = 0.000049 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.139200 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.139220 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352809.139235 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352809.139317 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000082 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 66.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.375136, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.179608 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.180514 +INFO: TimeDuration, Event = Add_end, Time = 0.000905 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.180532 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.181358 +INFO: TimeDuration, Event = Relu_end, Time = 0.000826 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.195832 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.196733 +INFO: TimeDuration, Event = Add_end, Time = 0.000900 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.196750 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.197604 +INFO: TimeDuration, Event = Relu_end, Time = 0.000854 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352809.197617 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352809.200557 +INFO: TimeDuration, Event = Pool_end, Time = 0.002939 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.222397 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.222867 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.222898 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.223333 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.240062 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.240539 +INFO: TimeDuration, Event = Add_end, Time = 0.000477 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.240553 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.240992 +INFO: TimeDuration, Event = Relu_end, Time = 0.000438 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352809.241004 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352809.247690 +INFO: TimeDuration, Event = Pool_end, Time = 0.006686 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.257855 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.258154 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.258167 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.258395 +INFO: TimeDuration, Event = Relu_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.267089 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.267393 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.267407 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.267632 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.279709 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.280009 +INFO: TimeDuration, Event = Add_end, Time = 0.000300 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.280023 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.280249 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352809.280262 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352809.283003 +INFO: TimeDuration, Event = Pool_end, Time = 0.002741 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.289589 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.289926 +INFO: TimeDuration, Event = Add_end, Time = 0.000337 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.289940 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.290062 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.296287 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.296627 +INFO: TimeDuration, Event = Add_end, Time = 0.000341 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.296640 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.296763 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.304425 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.304755 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.304770 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.304891 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352809.304910 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352809.313717 +INFO: TimeDuration, Event = Pool_end, Time = 0.008807 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.315597 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.315890 +INFO: TimeDuration, Event = Add_end, Time = 0.000293 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.315905 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.315950 +INFO: TimeDuration, Event = Relu_end, Time = 0.000045 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.318284 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.318576 +INFO: TimeDuration, Event = Add_end, Time = 0.000292 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.318594 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.318640 +INFO: TimeDuration, Event = Relu_end, Time = 0.000046 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.320968 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.321258 +INFO: TimeDuration, Event = Add_end, Time = 0.000290 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.321273 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.321317 +INFO: TimeDuration, Event = Relu_end, Time = 0.000044 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352809.321334 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352809.321946 +INFO: TimeDuration, Event = Pool_end, Time = 0.000613 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352809.321968 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352809.322065 +INFO: TimeDuration, Event = Mul_end, Time = 0.000098 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.322081 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.322105 +INFO: TimeDuration, Event = Add_end, Time = 0.000024 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.322118 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.322139 +INFO: TimeDuration, Event = Relu_end, Time = 0.000022 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352809.322158 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352809.322211 +INFO: TimeDuration, Event = Mul_end, Time = 0.000053 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.322226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.322247 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352809.322263 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352809.322352 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000090 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 145.504905, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.362835 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.363736 +INFO: TimeDuration, Event = Add_end, Time = 0.000901 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.363750 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.364583 +INFO: TimeDuration, Event = Relu_end, Time = 0.000833 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.379089 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.379987 +INFO: TimeDuration, Event = Add_end, Time = 0.000897 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.380001 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.380853 +INFO: TimeDuration, Event = Relu_end, Time = 0.000852 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352809.380868 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352809.383813 +INFO: TimeDuration, Event = Pool_end, Time = 0.002945 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.405653 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.406123 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.406140 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.406574 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.423329 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.423804 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.423816 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.424253 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352809.424266 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352809.430954 +INFO: TimeDuration, Event = Pool_end, Time = 0.006688 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.441125 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.441427 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.441440 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.441666 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.450362 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.450666 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.450699 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.450931 +INFO: TimeDuration, Event = Relu_end, Time = 0.000232 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.462976 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.463276 +INFO: TimeDuration, Event = Add_end, Time = 0.000300 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.463290 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.463516 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352809.463530 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352809.466269 +INFO: TimeDuration, Event = Pool_end, Time = 0.002739 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.472866 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.473199 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.473212 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.473335 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.479240 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.479575 +INFO: TimeDuration, Event = Add_end, Time = 0.000335 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.479591 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.479712 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.487377 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.487708 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.487720 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.487842 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352809.487928 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352809.490619 +INFO: TimeDuration, Event = Pool_end, Time = 0.002691 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.492827 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.493110 +INFO: TimeDuration, Event = Add_end, Time = 0.000283 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.493122 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.493164 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.495427 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.495712 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.495724 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.495766 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.498066 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.498350 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.498362 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.498405 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352809.498417 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352809.499024 +INFO: TimeDuration, Event = Pool_end, Time = 0.000608 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352809.499043 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352809.499134 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.499150 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.499171 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.499183 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.499203 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352809.499218 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352809.499265 +INFO: TimeDuration, Event = Mul_end, Time = 0.000047 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.499278 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.499297 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352809.499311 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352809.499396 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000085 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.267951, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.539274 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.540176 +INFO: TimeDuration, Event = Add_end, Time = 0.000901 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.540191 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.541021 +INFO: TimeDuration, Event = Relu_end, Time = 0.000830 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.555517 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.556420 +INFO: TimeDuration, Event = Add_end, Time = 0.000903 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.556438 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.557294 +INFO: TimeDuration, Event = Relu_end, Time = 0.000857 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352809.557307 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352809.560242 +INFO: TimeDuration, Event = Pool_end, Time = 0.002935 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.582084 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.582555 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.582571 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.583005 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.599716 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.600190 +INFO: TimeDuration, Event = Add_end, Time = 0.000474 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.600204 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.600639 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352809.600653 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352809.607343 +INFO: TimeDuration, Event = Pool_end, Time = 0.006690 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.617506 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.617803 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.617815 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.618041 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.626736 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.627041 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.627055 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.627280 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.639351 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.639652 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.639667 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.639894 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352809.639907 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352809.644218 +INFO: TimeDuration, Event = Pool_end, Time = 0.004311 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.649346 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.649686 +INFO: TimeDuration, Event = Add_end, Time = 0.000339 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.649699 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.649823 +INFO: TimeDuration, Event = Relu_end, Time = 0.000125 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.656063 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.656397 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.656433 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.656557 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.664205 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.664536 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.664550 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.664673 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352809.664696 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352809.667446 +INFO: TimeDuration, Event = Pool_end, Time = 0.002750 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.669660 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.669947 +INFO: TimeDuration, Event = Add_end, Time = 0.000288 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.669960 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.670002 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.672285 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.672573 +INFO: TimeDuration, Event = Add_end, Time = 0.000288 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.672587 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.672630 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.674927 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.675211 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.675224 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.675266 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352809.675279 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352809.675901 +INFO: TimeDuration, Event = Pool_end, Time = 0.000623 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352809.675920 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352809.676015 +INFO: TimeDuration, Event = Mul_end, Time = 0.000096 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.676029 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.676052 +INFO: TimeDuration, Event = Add_end, Time = 0.000023 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.676064 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.676085 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352809.676098 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352809.676150 +INFO: TimeDuration, Event = Mul_end, Time = 0.000053 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.676165 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.676184 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352809.676199 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352809.676344 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000145 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.871663, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.721203 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.722105 +INFO: TimeDuration, Event = Add_end, Time = 0.000902 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.722120 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.722945 +INFO: TimeDuration, Event = Relu_end, Time = 0.000825 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.737467 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.738369 +INFO: TimeDuration, Event = Add_end, Time = 0.000902 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.738385 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.739239 +INFO: TimeDuration, Event = Relu_end, Time = 0.000854 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352809.739253 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352809.742197 +INFO: TimeDuration, Event = Pool_end, Time = 0.002943 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.764034 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.764512 +INFO: TimeDuration, Event = Add_end, Time = 0.000477 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.764530 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.764967 +INFO: TimeDuration, Event = Relu_end, Time = 0.000437 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.781673 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.782150 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.782162 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.782598 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352809.782611 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352809.789301 +INFO: TimeDuration, Event = Pool_end, Time = 0.006691 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.799466 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.799764 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.799778 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.800006 +INFO: TimeDuration, Event = Relu_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.808703 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.809009 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.809021 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.809247 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.821320 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.821622 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.821634 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.821861 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352809.821874 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352809.824613 +INFO: TimeDuration, Event = Pool_end, Time = 0.002739 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.831200 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.831535 +INFO: TimeDuration, Event = Add_end, Time = 0.000335 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.831548 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.831669 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.837902 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.838236 +INFO: TimeDuration, Event = Add_end, Time = 0.000334 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.838250 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.838372 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.846337 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.846670 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.846684 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.846805 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352809.846823 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352809.849579 +INFO: TimeDuration, Event = Pool_end, Time = 0.002756 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.851790 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.852076 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.852089 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.852131 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.854425 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.854713 +INFO: TimeDuration, Event = Add_end, Time = 0.000288 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.854726 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.854767 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.857035 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.857318 +INFO: TimeDuration, Event = Add_end, Time = 0.000283 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.857331 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.857373 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352809.857386 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352809.858008 +INFO: TimeDuration, Event = Pool_end, Time = 0.000622 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352809.858050 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352809.858143 +INFO: TimeDuration, Event = Mul_end, Time = 0.000093 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.858171 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.858194 +INFO: TimeDuration, Event = Add_end, Time = 0.000023 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.858206 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.858226 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352809.858240 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352809.858288 +INFO: TimeDuration, Event = Mul_end, Time = 0.000048 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.858301 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.858321 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352809.858335 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352809.858420 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000085 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 66.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.977059, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.898497 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.899401 +INFO: TimeDuration, Event = Add_end, Time = 0.000904 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.899416 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.900240 +INFO: TimeDuration, Event = Relu_end, Time = 0.000824 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.914741 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.915638 +INFO: TimeDuration, Event = Add_end, Time = 0.000897 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.915654 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.916506 +INFO: TimeDuration, Event = Relu_end, Time = 0.000852 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352809.916522 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352809.919466 +INFO: TimeDuration, Event = Pool_end, Time = 0.002944 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.941307 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.941779 +INFO: TimeDuration, Event = Add_end, Time = 0.000472 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.941795 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.942231 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.958975 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.959451 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.959464 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.959899 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352809.959913 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352809.966604 +INFO: TimeDuration, Event = Pool_end, Time = 0.006691 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.976782 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.977081 +INFO: TimeDuration, Event = Add_end, Time = 0.000299 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.977095 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.977321 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.986049 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.986353 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.986367 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.986593 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352809.998670 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352809.998971 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352809.998984 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352809.999211 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352809.999222 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352810.001962 +INFO: TimeDuration, Event = Pool_end, Time = 0.002740 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.008564 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.008902 +INFO: TimeDuration, Event = Add_end, Time = 0.000338 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.008915 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.009037 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.014952 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.015285 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.015298 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.015419 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.023388 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.023718 +INFO: TimeDuration, Event = Add_end, Time = 0.000329 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.023730 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.023851 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352810.023868 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352810.026630 +INFO: TimeDuration, Event = Pool_end, Time = 0.002762 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.028837 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.029123 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.029136 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.029177 +INFO: TimeDuration, Event = Relu_end, Time = 0.000041 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.031439 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.031727 +INFO: TimeDuration, Event = Add_end, Time = 0.000289 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.031741 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.031783 +INFO: TimeDuration, Event = Relu_end, Time = 0.000041 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.034070 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.034353 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.034367 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.034409 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352810.034422 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352810.035045 +INFO: TimeDuration, Event = Pool_end, Time = 0.000623 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352810.035064 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352810.035155 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.035170 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.035192 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.035203 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.035223 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352810.035238 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352810.035285 +INFO: TimeDuration, Event = Mul_end, Time = 0.000047 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.035299 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.035319 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352810.035333 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352810.035417 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000085 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.668895, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.075437 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.076349 +INFO: TimeDuration, Event = Add_end, Time = 0.000912 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.076477 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.077291 +INFO: TimeDuration, Event = Relu_end, Time = 0.000814 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.091784 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.092687 +INFO: TimeDuration, Event = Add_end, Time = 0.000903 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.092704 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.093559 +INFO: TimeDuration, Event = Relu_end, Time = 0.000855 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352810.093572 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352810.096509 +INFO: TimeDuration, Event = Pool_end, Time = 0.002937 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.118350 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.118820 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.118857 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.119291 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.136002 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.136479 +INFO: TimeDuration, Event = Add_end, Time = 0.000477 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.136493 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.136929 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352810.136942 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352810.143631 +INFO: TimeDuration, Event = Pool_end, Time = 0.006688 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.153783 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.154080 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.154092 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.154319 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.163014 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.163317 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.163332 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.163557 +INFO: TimeDuration, Event = Relu_end, Time = 0.000224 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.175637 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.175938 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.175951 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.176178 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352810.176192 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352810.178934 +INFO: TimeDuration, Event = Pool_end, Time = 0.002742 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.185517 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.185856 +INFO: TimeDuration, Event = Add_end, Time = 0.000339 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.185869 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.185992 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.192218 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.192549 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.192563 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.192686 +INFO: TimeDuration, Event = Relu_end, Time = 0.000124 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.200654 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.200984 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.200998 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.201121 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352810.201139 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352810.203894 +INFO: TimeDuration, Event = Pool_end, Time = 0.002755 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.206094 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.206380 +INFO: TimeDuration, Event = Add_end, Time = 0.000286 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.206394 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.206435 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.208721 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.209010 +INFO: TimeDuration, Event = Add_end, Time = 0.000289 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.209023 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.209064 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.211344 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.211630 +INFO: TimeDuration, Event = Add_end, Time = 0.000286 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.211644 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.211687 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352810.211701 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352810.212320 +INFO: TimeDuration, Event = Pool_end, Time = 0.000619 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352810.212437 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352810.212530 +INFO: TimeDuration, Event = Mul_end, Time = 0.000093 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.212545 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.212566 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.212577 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.212597 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352810.212612 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352810.212659 +INFO: TimeDuration, Event = Mul_end, Time = 0.000047 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.212672 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.212691 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352810.212706 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352810.212793 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000087 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 68.199997 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.939832, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.252744 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.253649 +INFO: TimeDuration, Event = Add_end, Time = 0.000905 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.253663 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.254492 +INFO: TimeDuration, Event = Relu_end, Time = 0.000828 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.269056 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.269957 +INFO: TimeDuration, Event = Add_end, Time = 0.000901 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.269972 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.270832 +INFO: TimeDuration, Event = Relu_end, Time = 0.000860 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352810.270846 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352810.273781 +INFO: TimeDuration, Event = Pool_end, Time = 0.002935 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.295620 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.296089 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.296107 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.296542 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.313686 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.314160 +INFO: TimeDuration, Event = Add_end, Time = 0.000474 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.314173 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.314604 +INFO: TimeDuration, Event = Relu_end, Time = 0.000431 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352810.314617 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352810.321312 +INFO: TimeDuration, Event = Pool_end, Time = 0.006696 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.331484 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.331781 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.331793 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.332020 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.340874 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.341179 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.341208 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.341434 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.353480 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.353782 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.353795 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.354020 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352810.354033 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352810.356775 +INFO: TimeDuration, Event = Pool_end, Time = 0.002742 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.363362 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.363716 +INFO: TimeDuration, Event = Add_end, Time = 0.000354 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.363732 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.363854 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.369746 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.370075 +INFO: TimeDuration, Event = Add_end, Time = 0.000328 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.370088 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.370211 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.377874 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.378203 +INFO: TimeDuration, Event = Add_end, Time = 0.000329 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.378217 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.378339 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352810.378356 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352810.381113 +INFO: TimeDuration, Event = Pool_end, Time = 0.002757 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.383325 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.383608 +INFO: TimeDuration, Event = Add_end, Time = 0.000283 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.383622 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.383664 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.385959 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.386248 +INFO: TimeDuration, Event = Add_end, Time = 0.000289 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.386261 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.386303 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.388578 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.388861 +INFO: TimeDuration, Event = Add_end, Time = 0.000283 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.388873 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.388914 +INFO: TimeDuration, Event = Relu_end, Time = 0.000041 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352810.388928 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352810.389549 +INFO: TimeDuration, Event = Pool_end, Time = 0.000621 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352810.389567 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352810.389657 +INFO: TimeDuration, Event = Mul_end, Time = 0.000090 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.389672 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.389693 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.389705 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.389725 +INFO: TimeDuration, Event = Relu_end, Time = 0.000019 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352810.389739 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352810.389786 +INFO: TimeDuration, Event = Mul_end, Time = 0.000048 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.389799 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.389818 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352810.389832 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352810.389920 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000088 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.962938, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.431065 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.431974 +INFO: TimeDuration, Event = Add_end, Time = 0.000909 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.431989 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.432817 +INFO: TimeDuration, Event = Relu_end, Time = 0.000827 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.447327 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.448236 +INFO: TimeDuration, Event = Add_end, Time = 0.000909 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.448251 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.449104 +INFO: TimeDuration, Event = Relu_end, Time = 0.000853 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352810.449119 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352810.452052 +INFO: TimeDuration, Event = Pool_end, Time = 0.002933 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.474943 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.475414 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.475431 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.475866 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.491561 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.492038 +INFO: TimeDuration, Event = Add_end, Time = 0.000477 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.492052 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.492491 +INFO: TimeDuration, Event = Relu_end, Time = 0.000439 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352810.492505 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352810.499196 +INFO: TimeDuration, Event = Pool_end, Time = 0.006691 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.509355 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.509655 +INFO: TimeDuration, Event = Add_end, Time = 0.000299 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.509682 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.509910 +INFO: TimeDuration, Event = Relu_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.518956 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.519263 +INFO: TimeDuration, Event = Add_end, Time = 0.000307 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.519276 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.519503 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.531570 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.531872 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.531893 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.532118 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352810.532132 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352810.535915 +INFO: TimeDuration, Event = Pool_end, Time = 0.003783 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.541546 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.541881 +INFO: TimeDuration, Event = Add_end, Time = 0.000335 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.541894 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.542016 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.548238 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.548571 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.548585 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.548707 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.556368 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.556697 +INFO: TimeDuration, Event = Add_end, Time = 0.000329 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.556710 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.556832 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352810.556851 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352810.559608 +INFO: TimeDuration, Event = Pool_end, Time = 0.002757 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.561857 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.562143 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.562156 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.562198 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.564456 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.564742 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.564754 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.564797 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.567086 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.567371 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.567384 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.567426 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352810.567439 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352810.568063 +INFO: TimeDuration, Event = Pool_end, Time = 0.000624 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352810.568082 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352810.568173 +INFO: TimeDuration, Event = Mul_end, Time = 0.000092 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.568187 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.568208 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.568221 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.568241 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352810.568258 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352810.568311 +INFO: TimeDuration, Event = Mul_end, Time = 0.000053 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.568434 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.568456 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352810.568470 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352810.568557 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000087 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 140.229937, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.608760 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.609659 +INFO: TimeDuration, Event = Add_end, Time = 0.000900 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.609676 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.610507 +INFO: TimeDuration, Event = Relu_end, Time = 0.000831 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.625001 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.625911 +INFO: TimeDuration, Event = Add_end, Time = 0.000909 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.625928 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.626781 +INFO: TimeDuration, Event = Relu_end, Time = 0.000853 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352810.626795 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352810.629726 +INFO: TimeDuration, Event = Pool_end, Time = 0.002931 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.651570 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.652041 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.652056 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.652490 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.669224 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.669699 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.669712 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.670146 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352810.670160 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352810.676852 +INFO: TimeDuration, Event = Pool_end, Time = 0.006692 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.687027 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.687323 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.687336 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.687563 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.696258 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.696563 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.696576 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.696802 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.708874 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.709176 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.709191 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.709416 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352810.709428 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352810.712169 +INFO: TimeDuration, Event = Pool_end, Time = 0.002741 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.718954 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.719292 +INFO: TimeDuration, Event = Add_end, Time = 0.000338 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.719306 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.719429 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.725344 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.725677 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.725691 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.725814 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.733796 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.734129 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.734143 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.734265 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352810.734282 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352810.737036 +INFO: TimeDuration, Event = Pool_end, Time = 0.002754 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.739232 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.739516 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.739529 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.739572 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.741849 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.742138 +INFO: TimeDuration, Event = Add_end, Time = 0.000289 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.742151 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.742194 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.744457 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.744740 +INFO: TimeDuration, Event = Add_end, Time = 0.000283 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.744752 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.744794 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352810.744807 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352810.745432 +INFO: TimeDuration, Event = Pool_end, Time = 0.000625 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352810.745451 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352810.745542 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.745557 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.745579 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.745590 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.745610 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352810.745626 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352810.745673 +INFO: TimeDuration, Event = Mul_end, Time = 0.000047 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.745686 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.745705 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352810.745718 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352810.745806 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000087 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.839194, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.785901 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.786805 +INFO: TimeDuration, Event = Add_end, Time = 0.000904 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.786821 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.787649 +INFO: TimeDuration, Event = Relu_end, Time = 0.000828 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.802155 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.803058 +INFO: TimeDuration, Event = Add_end, Time = 0.000902 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.803074 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.803927 +INFO: TimeDuration, Event = Relu_end, Time = 0.000853 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352810.803943 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352810.806881 +INFO: TimeDuration, Event = Pool_end, Time = 0.002938 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.828725 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.829194 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.829211 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.829644 +INFO: TimeDuration, Event = Relu_end, Time = 0.000433 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.846370 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.846844 +INFO: TimeDuration, Event = Add_end, Time = 0.000473 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.846856 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.847289 +INFO: TimeDuration, Event = Relu_end, Time = 0.000433 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352810.847302 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352810.853998 +INFO: TimeDuration, Event = Pool_end, Time = 0.006697 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.864170 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.864468 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.864481 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.864707 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.873408 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.873712 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.873725 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.873950 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.886027 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.886328 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.886342 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.886568 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352810.886581 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352810.889321 +INFO: TimeDuration, Event = Pool_end, Time = 0.002741 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.895911 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.896246 +INFO: TimeDuration, Event = Add_end, Time = 0.000335 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.896259 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.896382 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.902311 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.902644 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.902658 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.902781 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.910440 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.910770 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.910784 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.910908 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352810.910925 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352810.913680 +INFO: TimeDuration, Event = Pool_end, Time = 0.002755 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.915867 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.916151 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.916164 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.916206 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.918495 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.918785 +INFO: TimeDuration, Event = Add_end, Time = 0.000290 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.918801 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.918844 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.921115 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.921396 +INFO: TimeDuration, Event = Add_end, Time = 0.000281 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.921411 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.921453 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352810.921466 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352810.922089 +INFO: TimeDuration, Event = Pool_end, Time = 0.000623 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352810.922107 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352810.922198 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.922213 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.922234 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.922246 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.922265 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352810.922280 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352810.922327 +INFO: TimeDuration, Event = Mul_end, Time = 0.000047 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.922340 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.922359 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352810.922372 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352810.922459 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000087 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 66.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.305551, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.962769 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.963663 +INFO: TimeDuration, Event = Add_end, Time = 0.000894 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.963678 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.964499 +INFO: TimeDuration, Event = Relu_end, Time = 0.000822 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352810.979239 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352810.980136 +INFO: TimeDuration, Event = Add_end, Time = 0.000897 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352810.980155 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352810.981006 +INFO: TimeDuration, Event = Relu_end, Time = 0.000851 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352810.981021 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352810.983945 +INFO: TimeDuration, Event = Pool_end, Time = 0.002924 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.005785 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.006256 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.006273 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.006706 +INFO: TimeDuration, Event = Relu_end, Time = 0.000433 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.023437 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.023913 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.023939 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.024374 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352811.024439 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352811.031064 +INFO: TimeDuration, Event = Pool_end, Time = 0.006625 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.041238 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.041536 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.041549 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.041775 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.050472 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.050776 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.050790 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.051016 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.063086 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.063386 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.063399 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.063624 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352811.063637 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352811.066381 +INFO: TimeDuration, Event = Pool_end, Time = 0.002745 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.072968 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.073305 +INFO: TimeDuration, Event = Add_end, Time = 0.000337 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.073319 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.073441 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.079668 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.080000 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.080013 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.080134 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.087790 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.088120 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.088133 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.088256 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352811.088275 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352811.091031 +INFO: TimeDuration, Event = Pool_end, Time = 0.002756 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.093217 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.093505 +INFO: TimeDuration, Event = Add_end, Time = 0.000288 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.093519 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.093561 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.095843 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.096130 +INFO: TimeDuration, Event = Add_end, Time = 0.000286 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.096143 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.096184 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.098462 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.098749 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.098762 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.098804 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352811.098815 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352811.099438 +INFO: TimeDuration, Event = Pool_end, Time = 0.000622 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352811.099456 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352811.099547 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.099561 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.099582 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.099594 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.099613 +INFO: TimeDuration, Event = Relu_end, Time = 0.000019 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352811.099627 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352811.099674 +INFO: TimeDuration, Event = Mul_end, Time = 0.000047 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.099687 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.099707 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352811.099720 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352811.099808 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000087 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.873337, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.139857 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.140762 +INFO: TimeDuration, Event = Add_end, Time = 0.000905 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.140779 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.141605 +INFO: TimeDuration, Event = Relu_end, Time = 0.000826 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.156096 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.156998 +INFO: TimeDuration, Event = Add_end, Time = 0.000902 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.157015 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.157869 +INFO: TimeDuration, Event = Relu_end, Time = 0.000855 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352811.157883 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352811.160823 +INFO: TimeDuration, Event = Pool_end, Time = 0.002940 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.182662 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.183133 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.183151 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.183587 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.200324 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.200799 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.200813 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.201249 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352811.201262 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352811.207947 +INFO: TimeDuration, Event = Pool_end, Time = 0.006684 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.218108 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.218405 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.218417 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.218644 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.227362 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.227666 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.227679 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.227904 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.239981 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.240283 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.240295 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.240527 +INFO: TimeDuration, Event = Relu_end, Time = 0.000231 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352811.240539 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352811.243275 +INFO: TimeDuration, Event = Pool_end, Time = 0.002736 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.249882 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.250224 +INFO: TimeDuration, Event = Add_end, Time = 0.000343 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.250237 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.250364 +INFO: TimeDuration, Event = Relu_end, Time = 0.000127 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.256596 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.256929 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.256943 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.257069 +INFO: TimeDuration, Event = Relu_end, Time = 0.000126 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.264720 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.265049 +INFO: TimeDuration, Event = Add_end, Time = 0.000328 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.265061 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.265183 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352811.265203 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352811.267965 +INFO: TimeDuration, Event = Pool_end, Time = 0.002762 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.270223 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.270511 +INFO: TimeDuration, Event = Add_end, Time = 0.000288 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.270524 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.270566 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.272837 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.273124 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.273138 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.273180 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.275486 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.275772 +INFO: TimeDuration, Event = Add_end, Time = 0.000286 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.275785 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.275828 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352811.275840 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352811.276431 +INFO: TimeDuration, Event = Pool_end, Time = 0.000591 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352811.276452 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352811.276548 +INFO: TimeDuration, Event = Mul_end, Time = 0.000096 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.276562 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.276586 +INFO: TimeDuration, Event = Add_end, Time = 0.000024 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.276598 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.276619 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352811.276633 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352811.276685 +INFO: TimeDuration, Event = Mul_end, Time = 0.000051 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.276698 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.276718 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352811.276733 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352811.276821 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000089 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.721860, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.316795 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.317704 +INFO: TimeDuration, Event = Add_end, Time = 0.000909 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.317721 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.318543 +INFO: TimeDuration, Event = Relu_end, Time = 0.000823 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.333054 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.333957 +INFO: TimeDuration, Event = Add_end, Time = 0.000903 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.333973 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.334821 +INFO: TimeDuration, Event = Relu_end, Time = 0.000848 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352811.334836 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352811.337764 +INFO: TimeDuration, Event = Pool_end, Time = 0.002929 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.359606 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.360078 +INFO: TimeDuration, Event = Add_end, Time = 0.000472 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.360094 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.360530 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.377263 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.377740 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.377755 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.378192 +INFO: TimeDuration, Event = Relu_end, Time = 0.000437 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352811.378206 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352811.384892 +INFO: TimeDuration, Event = Pool_end, Time = 0.006686 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.395052 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.395349 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.395362 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.395587 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.404283 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.404588 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.404604 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.404830 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.416899 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.417200 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.417214 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.417440 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352811.417467 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352811.420192 +INFO: TimeDuration, Event = Pool_end, Time = 0.002725 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.426790 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.427129 +INFO: TimeDuration, Event = Add_end, Time = 0.000339 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.427143 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.427264 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.433510 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.433845 +INFO: TimeDuration, Event = Add_end, Time = 0.000335 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.433859 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.433981 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.441633 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.441962 +INFO: TimeDuration, Event = Add_end, Time = 0.000328 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.441975 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.442098 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352811.442116 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352811.444875 +INFO: TimeDuration, Event = Pool_end, Time = 0.002759 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.447074 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.447363 +INFO: TimeDuration, Event = Add_end, Time = 0.000289 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.447377 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.447419 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.449705 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.449995 +INFO: TimeDuration, Event = Add_end, Time = 0.000291 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.450008 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.450050 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.452439 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.452720 +INFO: TimeDuration, Event = Add_end, Time = 0.000282 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.452735 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.452777 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352811.452790 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352811.453313 +INFO: TimeDuration, Event = Pool_end, Time = 0.000524 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352811.453332 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352811.453426 +INFO: TimeDuration, Event = Mul_end, Time = 0.000094 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.453452 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.453474 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.453486 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.453505 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352811.453520 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352811.453569 +INFO: TimeDuration, Event = Mul_end, Time = 0.000049 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.453582 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.453601 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352811.453616 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352811.453703 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000087 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.579886, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.496692 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.497592 +INFO: TimeDuration, Event = Add_end, Time = 0.000900 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.497607 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.498429 +INFO: TimeDuration, Event = Relu_end, Time = 0.000822 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.513017 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.513913 +INFO: TimeDuration, Event = Add_end, Time = 0.000896 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.513929 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.514774 +INFO: TimeDuration, Event = Relu_end, Time = 0.000845 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352811.514788 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352811.517742 +INFO: TimeDuration, Event = Pool_end, Time = 0.002954 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.539564 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.540035 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.540051 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.540487 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.557217 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.557693 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.557705 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.558141 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352811.558154 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352811.564845 +INFO: TimeDuration, Event = Pool_end, Time = 0.006690 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.575008 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.575305 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.575318 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.575544 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.584241 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.584546 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.584559 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.584787 +INFO: TimeDuration, Event = Relu_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.596854 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.597155 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.597169 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.597394 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352811.597427 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352811.600147 +INFO: TimeDuration, Event = Pool_end, Time = 0.002720 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.606746 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.607080 +INFO: TimeDuration, Event = Add_end, Time = 0.000334 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.607107 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.607229 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.613470 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.613801 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.613826 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.613948 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.621886 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.622216 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.622229 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.622349 +INFO: TimeDuration, Event = Relu_end, Time = 0.000120 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352811.622366 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352811.625127 +INFO: TimeDuration, Event = Pool_end, Time = 0.002760 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.627338 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.627623 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.627637 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.627679 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.629949 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.630236 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.630249 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.630290 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.632571 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.632853 +INFO: TimeDuration, Event = Add_end, Time = 0.000282 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.632865 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.632908 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352811.632920 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352811.633546 +INFO: TimeDuration, Event = Pool_end, Time = 0.000626 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352811.633564 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352811.633653 +INFO: TimeDuration, Event = Mul_end, Time = 0.000090 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.633667 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.633688 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.633701 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.633721 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352811.633736 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352811.633783 +INFO: TimeDuration, Event = Mul_end, Time = 0.000047 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.633796 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.633815 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352811.633830 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352811.633919 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000089 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 66.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.999585, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.674043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.674946 +INFO: TimeDuration, Event = Add_end, Time = 0.000903 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.674961 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.675789 +INFO: TimeDuration, Event = Relu_end, Time = 0.000828 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.690295 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.691198 +INFO: TimeDuration, Event = Add_end, Time = 0.000904 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.691215 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.692066 +INFO: TimeDuration, Event = Relu_end, Time = 0.000851 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352811.692079 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352811.695022 +INFO: TimeDuration, Event = Pool_end, Time = 0.002944 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.717912 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.718383 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.718399 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.718832 +INFO: TimeDuration, Event = Relu_end, Time = 0.000433 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.735283 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.735758 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.735781 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.736218 +INFO: TimeDuration, Event = Relu_end, Time = 0.000437 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352811.736232 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352811.742909 +INFO: TimeDuration, Event = Pool_end, Time = 0.006678 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.753112 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.753409 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.753421 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.753647 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.762474 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.762777 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.762791 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.763017 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.775084 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.775388 +INFO: TimeDuration, Event = Add_end, Time = 0.000303 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.775401 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.775628 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352811.775640 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352811.778380 +INFO: TimeDuration, Event = Pool_end, Time = 0.002740 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.784961 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.785295 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.785330 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.785453 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.791352 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.791683 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.791696 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.791818 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.799485 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.799818 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.799831 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.799953 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352811.799970 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352811.802725 +INFO: TimeDuration, Event = Pool_end, Time = 0.002755 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.804921 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.805206 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.805221 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.805263 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.807570 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.807855 +INFO: TimeDuration, Event = Add_end, Time = 0.000285 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.807868 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.807910 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.810227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.810511 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.810524 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.810566 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352811.810580 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352811.811198 +INFO: TimeDuration, Event = Pool_end, Time = 0.000618 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352811.811216 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352811.811313 +INFO: TimeDuration, Event = Mul_end, Time = 0.000097 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.811328 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.811350 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.811363 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.811383 +INFO: TimeDuration, Event = Relu_end, Time = 0.000021 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352811.811399 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352811.811449 +INFO: TimeDuration, Event = Mul_end, Time = 0.000049 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.811463 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.811483 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352811.811498 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352811.811605 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000107 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 140.312273, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.852658 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.853554 +INFO: TimeDuration, Event = Add_end, Time = 0.000896 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.853570 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.854398 +INFO: TimeDuration, Event = Relu_end, Time = 0.000828 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.868890 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.869788 +INFO: TimeDuration, Event = Add_end, Time = 0.000898 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.869803 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.870656 +INFO: TimeDuration, Event = Relu_end, Time = 0.000852 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352811.870669 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352811.873616 +INFO: TimeDuration, Event = Pool_end, Time = 0.002947 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.896510 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.896981 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.896997 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.897430 +INFO: TimeDuration, Event = Relu_end, Time = 0.000433 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.913142 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.913615 +INFO: TimeDuration, Event = Add_end, Time = 0.000473 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.913630 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.914061 +INFO: TimeDuration, Event = Relu_end, Time = 0.000432 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352811.914075 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352811.920757 +INFO: TimeDuration, Event = Pool_end, Time = 0.006682 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.930923 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.931220 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.931234 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.931461 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.940734 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.941038 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.941063 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.941289 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.953369 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.953670 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.953684 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.953911 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352811.953924 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352811.957701 +INFO: TimeDuration, Event = Pool_end, Time = 0.003777 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.963328 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.963664 +INFO: TimeDuration, Event = Add_end, Time = 0.000336 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.963677 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.963798 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.969739 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.970072 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.970086 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.970208 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.977855 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.978184 +INFO: TimeDuration, Event = Add_end, Time = 0.000329 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.978198 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.978321 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352811.978349 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352811.981098 +INFO: TimeDuration, Event = Pool_end, Time = 0.002749 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.983299 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.983589 +INFO: TimeDuration, Event = Add_end, Time = 0.000290 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.983602 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.983644 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.985922 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.986210 +INFO: TimeDuration, Event = Add_end, Time = 0.000288 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.986224 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.986267 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.988556 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.988840 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.988853 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.988896 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352811.988908 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352811.989527 +INFO: TimeDuration, Event = Pool_end, Time = 0.000619 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352811.989546 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352811.989636 +INFO: TimeDuration, Event = Mul_end, Time = 0.000090 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.989650 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.989671 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352811.989683 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352811.989703 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352811.989717 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352811.989764 +INFO: TimeDuration, Event = Mul_end, Time = 0.000047 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352811.989778 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352811.989797 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352811.989811 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352811.989897 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000086 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 68.199997 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.985990, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.030102 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.031001 +INFO: TimeDuration, Event = Add_end, Time = 0.000900 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.031017 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.031840 +INFO: TimeDuration, Event = Relu_end, Time = 0.000823 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.046340 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.047239 +INFO: TimeDuration, Event = Add_end, Time = 0.000899 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.047254 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.048101 +INFO: TimeDuration, Event = Relu_end, Time = 0.000847 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352812.048115 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352812.051065 +INFO: TimeDuration, Event = Pool_end, Time = 0.002950 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.072904 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.073375 +INFO: TimeDuration, Event = Add_end, Time = 0.000471 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.073390 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.073824 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.090572 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.091048 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.091062 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.091496 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352812.091507 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352812.098195 +INFO: TimeDuration, Event = Pool_end, Time = 0.006688 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.108370 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.108668 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.108680 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.108907 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.117611 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.117915 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.117929 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.118155 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.130223 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.130524 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.130538 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.130764 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352812.130776 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352812.133514 +INFO: TimeDuration, Event = Pool_end, Time = 0.002738 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.140106 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.140445 +INFO: TimeDuration, Event = Add_end, Time = 0.000339 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.140459 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.140581 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.146798 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.147130 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.147142 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.147265 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.154938 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.155267 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.155280 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.155401 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352812.155420 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352812.158179 +INFO: TimeDuration, Event = Pool_end, Time = 0.002759 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.160442 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.160733 +INFO: TimeDuration, Event = Add_end, Time = 0.000291 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.160746 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.160787 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.162986 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.163275 +INFO: TimeDuration, Event = Add_end, Time = 0.000290 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.163288 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.163330 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.165598 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.165881 +INFO: TimeDuration, Event = Add_end, Time = 0.000283 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.165895 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.165936 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352812.165948 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352812.166572 +INFO: TimeDuration, Event = Pool_end, Time = 0.000624 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352812.166594 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352812.166685 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.166699 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.166720 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.166731 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.166751 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352812.166765 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352812.166812 +INFO: TimeDuration, Event = Mul_end, Time = 0.000047 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.166826 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.166846 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352812.166859 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352812.166946 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000087 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.628039, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.207211 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.208118 +INFO: TimeDuration, Event = Add_end, Time = 0.000907 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.208134 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.208960 +INFO: TimeDuration, Event = Relu_end, Time = 0.000826 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.223455 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.224371 +INFO: TimeDuration, Event = Add_end, Time = 0.000916 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.224385 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.225238 +INFO: TimeDuration, Event = Relu_end, Time = 0.000852 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352812.225252 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352812.228179 +INFO: TimeDuration, Event = Pool_end, Time = 0.002927 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.250026 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.250497 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.250513 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.250943 +INFO: TimeDuration, Event = Relu_end, Time = 0.000431 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.267717 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.268192 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.268206 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.268640 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352812.268654 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352812.275343 +INFO: TimeDuration, Event = Pool_end, Time = 0.006689 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.285522 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.285819 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.285832 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.286058 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.294754 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.295059 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.295073 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.295299 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.307371 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.307673 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.307687 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.307914 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352812.307926 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352812.310664 +INFO: TimeDuration, Event = Pool_end, Time = 0.002738 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.317250 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.317586 +INFO: TimeDuration, Event = Add_end, Time = 0.000336 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.317599 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.317721 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.323635 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.323969 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.323982 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.324104 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.331762 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.332093 +INFO: TimeDuration, Event = Add_end, Time = 0.000331 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.332106 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.332227 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352812.332246 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352812.335002 +INFO: TimeDuration, Event = Pool_end, Time = 0.002757 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.337201 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.337485 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.337498 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.337539 +INFO: TimeDuration, Event = Relu_end, Time = 0.000041 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.339813 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.340099 +INFO: TimeDuration, Event = Add_end, Time = 0.000286 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.340112 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.340154 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.342445 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.342727 +INFO: TimeDuration, Event = Add_end, Time = 0.000282 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.342740 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.342782 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352812.342795 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352812.343419 +INFO: TimeDuration, Event = Pool_end, Time = 0.000624 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352812.343437 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352812.343528 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.343541 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.343562 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.343587 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.343608 +INFO: TimeDuration, Event = Relu_end, Time = 0.000021 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352812.343623 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352812.343672 +INFO: TimeDuration, Event = Mul_end, Time = 0.000049 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.343685 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.343705 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352812.343720 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352812.343807 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000087 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.434836, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.383933 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.384833 +INFO: TimeDuration, Event = Add_end, Time = 0.000900 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.384851 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.385679 +INFO: TimeDuration, Event = Relu_end, Time = 0.000829 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.400168 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.401070 +INFO: TimeDuration, Event = Add_end, Time = 0.000902 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.401088 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.401943 +INFO: TimeDuration, Event = Relu_end, Time = 0.000855 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352812.401957 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352812.404896 +INFO: TimeDuration, Event = Pool_end, Time = 0.002938 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.426735 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.427208 +INFO: TimeDuration, Event = Add_end, Time = 0.000473 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.427225 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.427659 +INFO: TimeDuration, Event = Relu_end, Time = 0.000434 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.444407 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.444884 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.444896 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.445330 +INFO: TimeDuration, Event = Relu_end, Time = 0.000433 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352812.445343 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352812.452033 +INFO: TimeDuration, Event = Pool_end, Time = 0.006690 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.462213 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.462511 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.462524 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.462752 +INFO: TimeDuration, Event = Relu_end, Time = 0.000228 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.471449 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.471753 +INFO: TimeDuration, Event = Add_end, Time = 0.000305 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.471766 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.471993 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.484061 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.484382 +INFO: TimeDuration, Event = Add_end, Time = 0.000321 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.484484 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.484709 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352812.484723 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352812.487356 +INFO: TimeDuration, Event = Pool_end, Time = 0.002633 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.493942 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.494276 +INFO: TimeDuration, Event = Add_end, Time = 0.000334 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.494289 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.494413 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.500325 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.500662 +INFO: TimeDuration, Event = Add_end, Time = 0.000337 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.500674 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.500797 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.508453 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.508782 +INFO: TimeDuration, Event = Add_end, Time = 0.000329 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.508794 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.508916 +INFO: TimeDuration, Event = Relu_end, Time = 0.000121 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352812.508933 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352812.511692 +INFO: TimeDuration, Event = Pool_end, Time = 0.002759 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.513901 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.514191 +INFO: TimeDuration, Event = Add_end, Time = 0.000289 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.514205 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.514246 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.516518 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.516810 +INFO: TimeDuration, Event = Add_end, Time = 0.000292 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.516823 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.516865 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.519158 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.519440 +INFO: TimeDuration, Event = Add_end, Time = 0.000282 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.519454 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.519496 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352812.519509 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352812.520123 +INFO: TimeDuration, Event = Pool_end, Time = 0.000614 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352812.520146 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352812.520237 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.520252 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.520273 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.520285 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.520345 +INFO: TimeDuration, Event = Relu_end, Time = 0.000060 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352812.520359 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352812.520409 +INFO: TimeDuration, Event = Mul_end, Time = 0.000050 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.520419 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.520440 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352812.520455 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352812.520543 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000088 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.387179, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.560278 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.561187 +INFO: TimeDuration, Event = Add_end, Time = 0.000909 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.561205 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.562033 +INFO: TimeDuration, Event = Relu_end, Time = 0.000829 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.576528 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.577429 +INFO: TimeDuration, Event = Add_end, Time = 0.000901 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.577444 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.578298 +INFO: TimeDuration, Event = Relu_end, Time = 0.000853 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352812.578312 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352812.581252 +INFO: TimeDuration, Event = Pool_end, Time = 0.002940 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.603092 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.603562 +INFO: TimeDuration, Event = Add_end, Time = 0.000470 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.603578 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.604011 +INFO: TimeDuration, Event = Relu_end, Time = 0.000433 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.620775 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.621251 +INFO: TimeDuration, Event = Add_end, Time = 0.000476 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.621264 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.621700 +INFO: TimeDuration, Event = Relu_end, Time = 0.000436 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352812.621724 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352812.628413 +INFO: TimeDuration, Event = Pool_end, Time = 0.006689 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.638575 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.638873 +INFO: TimeDuration, Event = Add_end, Time = 0.000298 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.638886 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.639112 +INFO: TimeDuration, Event = Relu_end, Time = 0.000227 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.647805 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.648109 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.648122 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.648347 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.660422 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.660723 +INFO: TimeDuration, Event = Add_end, Time = 0.000302 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.660736 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.660963 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352812.660974 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352812.663715 +INFO: TimeDuration, Event = Pool_end, Time = 0.002741 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.670314 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.670649 +INFO: TimeDuration, Event = Add_end, Time = 0.000335 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.670663 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.670785 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.677024 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.677356 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.677369 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.677491 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.685151 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.685483 +INFO: TimeDuration, Event = Add_end, Time = 0.000332 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.685498 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.685620 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352812.685638 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352812.688397 +INFO: TimeDuration, Event = Pool_end, Time = 0.002760 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.690585 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.690868 +INFO: TimeDuration, Event = Add_end, Time = 0.000282 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.690881 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.690923 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.693210 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.693525 +INFO: TimeDuration, Event = Add_end, Time = 0.000315 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.693538 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.693581 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.695834 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.696117 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.696131 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.696174 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352812.696187 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352812.696808 +INFO: TimeDuration, Event = Pool_end, Time = 0.000621 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352812.696827 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352812.696917 +INFO: TimeDuration, Event = Mul_end, Time = 0.000090 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.696931 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.696952 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.696965 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.696985 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352812.697000 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352812.697047 +INFO: TimeDuration, Event = Mul_end, Time = 0.000047 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.697060 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.697079 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352812.697093 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352812.697178 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000086 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 66.000000 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.559036, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.739986 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.740887 +INFO: TimeDuration, Event = Add_end, Time = 0.000901 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.740905 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.741732 +INFO: TimeDuration, Event = Relu_end, Time = 0.000827 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.756230 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.757132 +INFO: TimeDuration, Event = Add_end, Time = 0.000902 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.757150 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.758005 +INFO: TimeDuration, Event = Relu_end, Time = 0.000855 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352812.758018 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352812.760959 +INFO: TimeDuration, Event = Pool_end, Time = 0.002941 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.782805 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.783274 +INFO: TimeDuration, Event = Add_end, Time = 0.000469 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.783292 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.783731 +INFO: TimeDuration, Event = Relu_end, Time = 0.000439 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.800465 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.800941 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.800953 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.801390 +INFO: TimeDuration, Event = Relu_end, Time = 0.000437 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352812.801403 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352812.808092 +INFO: TimeDuration, Event = Pool_end, Time = 0.006689 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.818255 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.818552 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.818565 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.818790 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.827493 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.827799 +INFO: TimeDuration, Event = Add_end, Time = 0.000306 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.827812 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.828038 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.842321 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.842629 +INFO: TimeDuration, Event = Add_end, Time = 0.000308 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.842642 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.842872 +INFO: TimeDuration, Event = Relu_end, Time = 0.000230 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352812.842885 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352812.845620 +INFO: TimeDuration, Event = Pool_end, Time = 0.002735 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.852208 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.852545 +INFO: TimeDuration, Event = Add_end, Time = 0.000337 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.852560 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.852684 +INFO: TimeDuration, Event = Relu_end, Time = 0.000124 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.858596 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.858930 +INFO: TimeDuration, Event = Add_end, Time = 0.000334 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.858944 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.859066 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.866724 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.867055 +INFO: TimeDuration, Event = Add_end, Time = 0.000330 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.867067 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.867189 +INFO: TimeDuration, Event = Relu_end, Time = 0.000122 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352812.867208 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352812.869966 +INFO: TimeDuration, Event = Pool_end, Time = 0.002758 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.872205 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.872492 +INFO: TimeDuration, Event = Add_end, Time = 0.000287 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.872507 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.872550 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.874815 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.875102 +INFO: TimeDuration, Event = Add_end, Time = 0.000286 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.875116 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.875157 +INFO: TimeDuration, Event = Relu_end, Time = 0.000041 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.877427 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.877708 +INFO: TimeDuration, Event = Add_end, Time = 0.000281 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.877721 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.877763 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352812.877776 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352812.878408 +INFO: TimeDuration, Event = Pool_end, Time = 0.000632 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352812.878427 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352812.878523 +INFO: TimeDuration, Event = Mul_end, Time = 0.000096 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.878536 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.878559 +INFO: TimeDuration, Event = Add_end, Time = 0.000022 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.878572 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.878591 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352812.878605 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352812.878658 +INFO: TimeDuration, Event = Mul_end, Time = 0.000053 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.878672 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.878692 +INFO: TimeDuration, Event = Add_end, Time = 0.000020 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352812.878705 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352812.878797 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000092 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 64.599998 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 141.578748, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +DEBUG: ***--- size_in_bytes = 6144000 +DEBUG: Attempting to Allocate = 6144000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 3072, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: Moving 6144000 bytes from host to GPU +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.919087 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.919992 +INFO: TimeDuration, Event = Add_end, Time = 0.000905 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.920008 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.920834 +INFO: TimeDuration, Event = Relu_end, Time = 0.000826 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 131072000 +DEBUG: Attempting to Allocate = 131072000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 65536, cStride = 1024, hStride = 32, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.935374 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 32768000 +INFO: bias->num_elems = 64 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.936277 +INFO: TimeDuration, Event = Add_end, Time = 0.000903 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.936293 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.937155 +INFO: TimeDuration, Event = Relu_end, Time = 0.000862 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352812.937171 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 64, h = 16, w = 16 , dim1 = 32 , dim2 = 32 +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352812.940102 +INFO: TimeDuration, Event = Pool_end, Time = 0.002931 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.961944 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.962418 +INFO: TimeDuration, Event = Add_end, Time = 0.000474 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.962435 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.962885 +INFO: TimeDuration, Event = Relu_end, Time = 0.000450 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 65536000 +DEBUG: Attempting to Allocate = 65536000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 32768, cStride = 256, hStride = 16, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.979660 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 16384000 +INFO: bias->num_elems = 128 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.980135 +INFO: TimeDuration, Event = Add_end, Time = 0.000475 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.980148 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.980583 +INFO: TimeDuration, Event = Relu_end, Time = 0.000435 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352812.980597 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 128, h = 8, w = 8 , dim1 = 16 , dim2 = 16 +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352812.987283 +INFO: TimeDuration, Event = Pool_end, Time = 0.006686 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352812.997448 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352812.997746 +INFO: TimeDuration, Event = Add_end, Time = 0.000297 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352812.997758 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352812.997983 +INFO: TimeDuration, Event = Relu_end, Time = 0.000225 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352813.006684 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352813.006988 +INFO: TimeDuration, Event = Add_end, Time = 0.000304 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352813.007002 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352813.007227 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 32768000 +DEBUG: Attempting to Allocate = 32768000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 16384, cStride = 64, hStride = 8, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352813.019305 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 8192000 +INFO: bias->num_elems = 256 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352813.019606 +INFO: TimeDuration, Event = Add_end, Time = 0.000301 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352813.019619 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352813.019845 +INFO: TimeDuration, Event = Relu_end, Time = 0.000226 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352813.019858 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 256, h = 4, w = 4 , dim1 = 8 , dim2 = 8 +DEBUG: ***--- size_in_bytes = 8192000 +DEBUG: Attempting to Allocate = 8192000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 4096, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352813.022599 +INFO: TimeDuration, Event = Pool_end, Time = 0.002741 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352813.029192 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352813.029527 +INFO: TimeDuration, Event = Add_end, Time = 0.000335 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352813.029540 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352813.029663 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352813.035904 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352813.036237 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352813.036252 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352813.036375 +INFO: TimeDuration, Event = Relu_end, Time = 0.000124 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 16384000 +DEBUG: Attempting to Allocate = 16384000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 8192, cStride = 16, hStride = 4, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352813.044040 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 4096000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352813.044374 +INFO: TimeDuration, Event = Add_end, Time = 0.000333 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352813.044385 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352813.044508 +INFO: TimeDuration, Event = Relu_end, Time = 0.000123 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352813.044527 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 2, w = 2 , dim1 = 4 , dim2 = 4 +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352813.047280 +INFO: TimeDuration, Event = Pool_end, Time = 0.002752 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352813.049495 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352813.049779 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352813.049792 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352813.049834 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352813.052123 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352813.052413 +INFO: TimeDuration, Event = Add_end, Time = 0.000290 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352813.052428 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352813.052470 +INFO: TimeDuration, Event = Relu_end, Time = 0.000043 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for ConvLayer +*** Convolution + ApproxChoice = 2 + BatchNorm = 1 + CONV = 2 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: FP32 BASELINE +DEBUG: ***--- size_in_bytes = 4096000 +DEBUG: Attempting to Allocate = 4096000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 2048, cStride = 4, hStride = 2, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352813.054749 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 1024000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352813.055033 +INFO: TimeDuration, Event = Add_end, Time = 0.000284 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352813.055046 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352813.055088 +INFO: TimeDuration, Event = Relu_end, Time = 0.000042 +INFO: *** TensorPooling +INFO: AbsoluteTime, Event = Pool, Time = 1607352813.055100 +DEBUG: No data movement required - Data on Device +DEBUG: n = 500, c = 512, h = 1, w = 1 , dim1 = 2 , dim2 = 2 +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +INFO: AbsoluteTime, Event = Pool_end, Time = 1607352813.055708 +INFO: TimeDuration, Event = Pool_end, Time = 0.000607 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352813.055726 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 512, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 1024000 +DEBUG: Attempting to Allocate = 1024000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 512, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352813.055817 +INFO: TimeDuration, Event = Mul_end, Time = 0.000091 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352813.055832 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 256000 +INFO: bias->num_elems = 512 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352813.055853 +INFO: TimeDuration, Event = Add_end, Time = 0.000021 +INFO: *** TensorRelu +INFO: AbsoluteTime, Event = Relu, Time = 1607352813.055866 +DEBUG: No data movement required - Data on Device +INFO: AbsoluteTime, Event = Relu_end, Time = 1607352813.055886 +INFO: TimeDuration, Event = Relu_end, Time = 0.000020 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: GPU Configuration for FCLayer +INFO: *** TensorGemmGPU +INFO: AbsoluteTime, Event = Mul, Time = 1607352813.055900 +INFO: rhs->dims.num_dims = 4 +INFO: lhs->dims.num_dims = 4 +INFO: m = 500, n = 100, k = 512 +DEBUG: Creating new TENSOR * +DEBUG: ***--- size_in_bytes = 200000 +DEBUG: Attempting to Allocate = 200000 + + +DEBUG: tensor->data_format = 0 +INFO: nStride = 100, cStride = 1, hStride = 1, wStride = 1 +DEBUG: tensor->data_format = 0 +DEBUG: Changing placement * +DEBUG: Changed Placement * + +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +DEBUG: CuBlasSgemm * +INFO: AbsoluteTime, Event = Mul_end, Time = 1607352813.055948 +INFO: TimeDuration, Event = Mul_end, Time = 0.000048 +INFO: *** TensorAdd +INFO: AbsoluteTime, Event = Add, Time = 1607352813.055961 +DEBUG: No data movement required - Data on Device +DEBUG: No data movement required - Data on Device +INFO: x->num_elems = 50000 +INFO: bias->num_elems = 100 +INFO: AbsoluteTime, Event = Add_end, Time = 1607352813.055980 +INFO: TimeDuration, Event = Add_end, Time = 0.000019 +INFO: No activation Function +DEBUG: No data movement required - Data on Device +INFO: *** TensorSoftmax +INFO: AbsoluteTime, Event = Softmax, Time = 1607352813.056017 +DEBUG: No data movement required - Data on Device +INFO: Moving 200000 bytes from GPU to host +INFO: AbsoluteTime, Event = Softmax_end, Time = 1607352813.056104 +INFO: TimeDuration, Event = Softmax_end, Time = 0.000088 +DEBUG: No data movement required - Data on Host +batch_dim = 500, num_classes = 100 +****** Accuracy = 67.800003 + +DEBUG: findNextConfiguration: Updated configurationIdx to 1. +DEBUG: findTargetConfiguration: goalVal: 0.000000, search kind: 2. +DEBUG: findTargetConfiguration: Updated configurationIdx to 0. +INFO: current iteration time = 139.828359, current iteration energy = 0.000000 + +DEBUG: **** Freeing Ouput Tensors *** +Exiting profiler +INFO: Writing Runtime Profile Info File... +INFO: Done writing profile. diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar100/out-run-1 b/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar100/out-run-1 new file mode 100644 index 0000000000000000000000000000000000000000..f4859d78ec7d392ece533bb2ee87858132c60f3b --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar100/out-run-1 @@ -0,0 +1 @@ +run_dnn_frequency_exp.sh: line 28: ./vgg16_cifar100_loop_wrapperapi_linked: No such file or directory diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar100/predictive/vgg16_cifar100.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar100/predictive/vgg16_cifar100.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c29bedd096aec2c7f66afbe729353e372fac403 --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_cifar100/predictive/vgg16_cifar100.txt @@ -0,0 +1,970 @@ +3768.819777999999 ++++++ +conf1 1 1 66.5 0.0 +1 gpu conv fp32 11 add fp32 1 relu fp32 1 +2 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +3 gpu conv fp32 11 add fp32 1 relu fp32 1 +4 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +5 gpu conv fp32 11 add fp32 1 relu fp32 1 +6 gpu conv fp32 11 add fp32 1 relu fp32 1 +7 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +8 gpu conv fp32 11 add fp32 1 relu fp32 1 +9 gpu conv fp32 11 add fp32 1 relu fp32 1 +10 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +11 gpu conv fp32 11 add fp32 1 relu fp32 1 +12 gpu conv fp32 11 add fp32 1 relu fp32 1 +13 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +14 gpu mul fp32 11 add fp32 1 relu fp32 1 +15 gpu mul fp32 11 add fp32 1 +16 gpu softmax fp32 1 +----- ++++++ +conf2 2.2877724452131787 2.08025704453875 66.45 0.04999999999999716 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 153 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 266 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 269 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 268 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 162 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 266 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf3 2.5314658805383816 2.30737681453141 66.45 0.04999999999999716 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf4 2.044123178914057 1.8616966918258782 66.32000000000001 0.1799999999999926 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 168 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 268 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 11 add fp16 1 relu fp16 1 +13 gpu conv fp16 11 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf5 2.231179358259141 2.0317825813373864 66.18 0.3199999999999932 +1 gpu conv fp16 11 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 161 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv fp16 12 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 269 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 265 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf6 2.2474834421641057 2.0338639876373272 65.88000000000001 0.6199999999999903 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 266 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 268 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 267 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 265 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 267 add fp16 1 relu fp16 1 +13 gpu conv fp16 11 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf7 2.22281439516094 2.0205460706906377 65.88000000000001 0.6199999999999903 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 268 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 161 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 167 add fp16 1 relu fp16 1 +12 gpu conv perf_fp16 161 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf8 2.1625085012968484 1.94560449637282 65.88000000000001 0.6199999999999903 +1 gpu conv fp16 11 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 266 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv fp16 11 add fp16 1 relu fp16 1 +10 gpu conv fp16 11 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 264 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 263 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf9 2.639337323402163 2.3960416499256825 65.8 0.7000000000000028 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 269 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf10 2.672718090670276 2.4276905528801507 65.68 0.8199999999999932 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf11 2.699089631751789 2.446114054498494 65.68 0.8199999999999932 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf12 2.6003752638648767 2.3553067802112344 65.64 0.8599999999999994 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +12 gpu conv fp16 11 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf13 2.638763904718665 2.395072565223988 65.64 0.8599999999999994 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 268 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf14 2.6003752638648767 2.3553067802112344 65.64 0.8599999999999994 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +12 gpu conv fp16 11 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf15 2.6003752638648767 2.3553067802112344 65.64 0.8599999999999994 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +12 gpu conv fp16 11 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf16 2.6732183804279006 2.4287517162140326 65.62 0.8799999999999955 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf17 2.6728394017929027 2.428768169588016 65.60000000000001 0.8999999999999915 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf18 2.4549989178389238 2.2406620346549433 65.56 0.9399999999999977 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 156 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 266 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf19 2.673556689244081 2.429092581627209 65.52 0.980000000000004 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf20 2.6525635304451756 2.406830663552284 65.5 1.0 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 263 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf21 2.6692288605087553 2.423462800937785 65.5 1.0 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf22 2.583650505571873 2.3471533059252194 65.48 1.019999999999996 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 263 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf23 2.6474572655420125 2.400471260394867 65.48 1.019999999999996 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 265 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf24 2.4710116424304736 2.2555966923178996 65.46 1.0400000000000063 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 161 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 266 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf25 2.557911102074785 2.3292661683311526 65.46 1.0400000000000063 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 156 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf26 2.6032957018479532 2.367574146141511 65.44 1.0600000000000023 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 163 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf27 2.6029968728098916 2.3672068592437223 65.44 1.0600000000000023 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 164 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf28 2.602540311129756 2.3691028781436954 65.44 1.0600000000000023 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 167 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf29 2.602756708588441 2.3708111025211718 65.44 1.0600000000000023 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 168 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf30 2.603240857443844 2.3662875785790183 65.44 1.0600000000000023 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf31 2.602882717372841 2.368011704225619 65.44 1.0600000000000023 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf32 2.67999343314603 2.4305182001043826 65.4 1.0999999999999943 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf33 2.670314990364046 2.4275308713267485 65.38000000000001 1.1199999999999903 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf34 2.650982630033638 2.405821467700663 65.36 1.1400000000000006 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 263 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf35 2.6507266317871756 2.405938171802741 65.36 1.1400000000000006 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 265 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf36 2.6523068534836174 2.406695716686769 65.34 1.1599999999999966 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 264 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf37 2.6533198495191073 2.4077689394073865 65.34 1.1599999999999966 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 264 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf38 2.64630900155657 2.4073892305914986 65.32 1.1800000000000068 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf39 2.6725522534379413 2.42903505877629 65.32 1.1800000000000068 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf40 2.6435249267602225 2.403536258709464 65.3 1.2000000000000028 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 161 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf41 2.6442059720503557 2.4037376163252024 65.3 1.2000000000000028 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf42 2.6536933126724027 2.4077527693156053 65.3 1.2000000000000028 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 264 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf43 2.6442798101298948 2.4056031584129225 65.3 1.2000000000000028 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf44 2.603921271336049 2.3665955131107683 65.28 1.2199999999999989 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf45 2.4967248028856828 2.2748997625822716 65.25999999999999 1.240000000000009 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 266 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf46 2.4963953691980665 2.2764932409573166 65.25999999999999 1.240000000000009 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 266 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf47 2.678944927989822 2.4251978482969956 65.24 1.2600000000000051 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 264 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf48 2.6727135417173904 2.428897140422096 65.22 1.2800000000000011 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf49 2.600256135586627 2.355428067042657 65.16 1.3400000000000034 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +12 gpu conv fp16 11 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf50 2.264460006128871 2.058037581586567 64.9 1.5999999999999943 +1 gpu conv fp16 11 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv perf_fp16 165 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv fp16 12 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 269 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 164 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 263 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 265 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- ++++++ +conf51 2.2817447204106736 2.0758846029697513 64.84 1.6599999999999966 +1 gpu conv fp16 11 add fp16 1 relu fp16 1 +2 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv perf_fp16 165 add fp16 1 relu fp16 1 +4 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv fp16 12 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 267 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +9 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 265 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 +16 gpu softmax fp32 1 +----- diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_imagenet/Makefile b/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_imagenet/Makefile index 067491b1c1d70ce37409da67d8afc6cf46cf345c..8875f2eddf39493321fb706b9eec45880819da16 100644 --- a/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_imagenet/Makefile +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_imagenet/Makefile @@ -1,4 +1,6 @@ DNN_BENCHMARK_ROOT = $(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks +# NOTE: can configure build directory +#HPVM_BUILD_DIR = $(LLVM_SRC_ROOT)/../build_hpvm/ HPVM_BUILD_DIR = $(LLVM_BUILD_ROOT) CC = $(HPVM_BUILD_DIR)/bin/clang++ @@ -20,7 +22,7 @@ TENSOR_AUTOTUNER_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/lib/libtensor_au CC_FLAGS = -I $(LLVM_INCLUDE_DIR) -I $(TENSOR_INCLUDE_DIR) -I $(TENSOR_RT_INCLUDE_DIR) -I $(CUDA_INCLUDE_PATH) -fno-exceptions -ffast-math -std=c++11 -O3 CCFLAGS += -DDEVICE=CUDNN_TARGET -LINKER_FLAGS = -lpthread -lcudart -lcurand -lcudnn -lcublas -lcufft -lOpenCL -lstdc++fs -lomp +LINKER_FLAGS = -lpthread -lcudart -lcurand -lcudnn -lcublas -lcufft -lOpenCL -lstdc++fs -lomp HPVM_LIB_DIR = $(HPVM_BUILD_DIR)/lib @@ -33,12 +35,16 @@ PROMISE_QUANT_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(A VISC_OPTFLAGS2 = -load $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LLVMInPlaceDFGAnalysis.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_PROMISE.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_CUDNN.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_X86.so -load $(HPVM_LIB_DIR)/LLVMFuseHPVMTensorNodes.so -load $(HPVM_LIB_DIR)/LLVMClearDFG.so -inplace -hpvm-fuse -dfg2llvm-promise -quantization-levels-filename=$(PROMISE_QUANT_FILE_PATH) -dfg2llvm-cudnn -dfg2llvm-x86 -clearDFG WRAPPER_API_QUANT_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/data/quant_ranges_rt.txt -CONF_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/data/tuner_confs_base.txt - +CONF_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/data/tuner_confs.txt VISC_OPTFLAGS3 = -load $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LLVMInPlaceDFGAnalysis.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_WrapperAPI.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_X86.so -load $(HPVM_LIB_DIR)/LLVMFuseHPVMTensorNodes.so -load $(HPVM_LIB_DIR)/LLVMClearDFG.so -inplace -hpvm-fuse -dfg2llvm-wrapperapi -quantization-levels-filename=$(WRAPPER_API_QUANT_FILE_PATH) -configuration-inputs-filename=$(CONF_FILE_PATH) -dfg2llvm-x86 -clearDFG +TEMP_CONF_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/predictive/temp.txt + + +VISC_PRED_OPTFLAGS3 = -load $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LLVMInPlaceDFGAnalysis.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_WrapperAPI.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_X86.so -load $(HPVM_LIB_DIR)/LLVMFuseHPVMTensorNodes.so -load $(HPVM_LIB_DIR)/LLVMClearDFG.so -inplace -hpvm-fuse -dfg2llvm-wrapperapi -quantization-levels-filename=$(WRAPPER_API_QUANT_FILE_PATH) -configuration-inputs-filename=$(TEMP_CONF_FILE_PATH) -dfg2llvm-x86 -clearDFG + TARGET = $(BUILD_DIR)/$(APP).opt.bc SOURCES = $(SRC_DIR)/$(APP).cpp VISC_RT_PATH = $(LLVM_SRC_ROOT)/../build/projects/visc-rt/visc-rt.ll @@ -50,24 +56,32 @@ default: $(BUILD_DIR) $(TARGET) $(BUILD_DIR)/%.ll: $(SRC_DIR)/%.cpp $(CC) $(CC_FLAGS) -emit-llvm src/$(APP).cpp -S -o $(BUILD_DIR)/$(APP).ll - #-- $(CC) $(CC_FLAGS) -emit-llvm src/$(APP)_promise.cpp -S -o $(BUILD_DIR)/$(APP)_promise.ll + $(CC) $(CC_FLAGS) -emit-llvm src/$(APP)_promise.cpp -S -o $(BUILD_DIR)/$(APP)_promise.ll $(CC) $(CC_FLAGS) -emit-llvm src/$(APP)_loop.cpp -S -o $(BUILD_DIR)/$(APP)_loop.ll + $(BUILD_DIR)/%.opt.bc: $(BUILD_DIR)/%.ll $(OPT) -load LLVMGenVISC.so -genvisc -globaldce $(BUILD_DIR)/$(APP).ll -S -o $(BUILD_DIR)/$(APP).visc.ll - #-- $(OPT) -load LLVMGenVISC.so -genvisc -globaldce $(BUILD_DIR)/$(APP)_promise.ll -S -o $(BUILD_DIR)/$(APP)_promise.visc.ll + $(OPT) -load LLVMGenVISC.so -genvisc -globaldce $(BUILD_DIR)/$(APP)_promise.ll -S -o $(BUILD_DIR)/$(APP)_promise.visc.ll $(OPT) -load LLVMGenVISC.so -genvisc -globaldce $(BUILD_DIR)/$(APP)_loop.ll -S -o $(BUILD_DIR)/$(APP)_loop.visc.ll $(OPT) $(VISC_OPTFLAGS) $(BUILD_DIR)/$(APP).visc.ll -o $(BUILD_DIR)/$(APP)_cudnn.bc - #-- $(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_wrapperapi.bc + #$(OPT) $(VISC_OPTFLAGS2) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_promise.bc + $(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_wrapperapi.bc $(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_loop.visc.ll -o $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc - + $(OPT) $(VISC_PRED_OPTFLAGS3) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_pred_wrapperapi.bc + $(OPT) $(VISC_PRED_OPTFLAGS3) $(BUILD_DIR)/$(APP)_loop.visc.ll -o $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi.bc $(LLVM_LINK) $(BUILD_DIR)/$(APP)_cudnn.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_cudnn_linked.bc - #-- $(LLVM_LINK) $(BUILD_DIR)/$(APP)_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc + #$(LLVM_LINK) $(BUILD_DIR)/$(APP)_promise.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_promise_linked.bc + $(LLVM_LINK) $(BUILD_DIR)/$(APP)_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(LLVM_LINK) $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc + $(LLVM_LINK) $(BUILD_DIR)/$(APP)_pred_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_pred_wrapperapi_linked.bc + $(LLVM_LINK) $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi_linked.bc $(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_cudnn_linked $(LINKER_FLAGS) - #- $(CC) $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked $(LINKER_FLAGS) + #$(CC) $(BUILD_DIR)/$(APP)_promise_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_promise_linked $(LINKER_FLAGS) + $(CC) $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked $(LINKER_FLAGS) $(CC) $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked $(LINKER_FLAGS) - + $(CC) $(BUILD_DIR)/$(APP)_pred_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_pred_wrapperapi_linked $(LINKER_FLAGS) + $(CC) $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_pred_loop_wrapperapi_linked $(LINKER_FLAGS) $(BUILD_DIR): mkdir -p $@ diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_imagenet/predictive/vgg16_imagenet.txt b/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_imagenet/predictive/vgg16_imagenet.txt new file mode 100644 index 0000000000000000000000000000000000000000..108a101c810f4ebe488e6f2029be4d970d7869a2 --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/vgg16_imagenet/predictive/vgg16_imagenet.txt @@ -0,0 +1,561 @@ +19194.623482 ++++++ +conf1 1 1 72.84 0.0 +1 gpu conv fp32 11 add fp32 1 relu fp32 1 +2 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +3 gpu conv fp32 11 add fp32 1 relu fp32 1 +4 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +5 gpu conv fp32 11 add fp32 1 relu fp32 1 +6 gpu conv fp32 11 add fp32 1 relu fp32 1 +7 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +8 gpu conv fp32 11 add fp32 1 relu fp32 1 +9 gpu conv fp32 11 add fp32 1 relu fp32 1 +10 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +11 gpu conv fp32 11 add fp32 1 relu fp32 1 +12 gpu conv fp32 11 add fp32 1 relu fp32 1 +13 gpu conv fp32 11 add fp32 1 relu fp32 1 pool_max fp32 1 +14 gpu mul fp32 11 add fp32 1 relu fp32 1 +15 gpu mul fp32 11 add fp32 1 relu fp32 1 +16 gpu mul fp32 11 add fp32 1 +17 gpu softmax fp32 1 +----- ++++++ +conf2 2.0787477568568082 1.7725701909562666 72.76 0.0799999999999983 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 267 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv fp16 12 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv fp16 12 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf3 2.2877881266029436 1.9268677640464096 72.04 0.7999999999999972 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv fp16 12 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf4 2.493698381711785 2.0336802939709626 72.02 0.8200000000000074 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 162 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv fp16 12 add fp16 1 relu fp16 1 +9 gpu conv fp16 12 add fp16 1 relu fp16 1 +10 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf5 2.164723960411776 1.8442442134020163 71.94 0.9000000000000057 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 267 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf6 2.53794461743687 2.069640641367895 71.67999999999999 1.1600000000000108 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 162 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv fp16 12 add fp16 1 relu fp16 1 +9 gpu conv fp16 12 add fp16 1 relu fp16 1 +10 gpu conv perf_fp16 156 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf7 1.7943268128686711 1.6103705347377417 71.58 1.2600000000000051 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv fp16 12 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv fp16 12 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 266 add fp16 1 relu fp16 1 +10 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv fp16 11 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf8 1.8143284638396158 1.6288620764171362 71.5 1.3400000000000034 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv fp16 12 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv fp16 12 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 266 add fp16 1 relu fp16 1 +10 gpu conv perf_fp16 162 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv fp16 11 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf9 2.5462742331906263 2.076061630349781 71.48 1.3599999999999994 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 167 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv fp16 12 add fp16 1 relu fp16 1 +9 gpu conv fp16 12 add fp16 1 relu fp16 1 +10 gpu conv perf_fp16 156 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf10 2.526515422129153 2.063839193109964 71.39999999999999 1.440000000000012 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 162 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +9 gpu conv fp16 12 add fp16 1 relu fp16 1 +10 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv fp16 11 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf11 2.1596661517243856 1.8351710968407349 71.34 1.5 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 267 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 268 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv perf_fp16 156 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf12 2.3444383477958337 1.981259839350623 71.22 1.6200000000000045 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +9 gpu conv fp16 12 add fp16 1 relu fp16 1 +10 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf13 1.8402020049200172 1.652343405000522 71.2 1.6400000000000006 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv fp16 12 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 266 add fp16 1 relu fp16 1 +10 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +13 gpu conv fp16 11 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf14 2.6420417968257306 2.167425635999969 71.12 1.7199999999999989 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 167 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +9 gpu conv fp16 12 add fp16 1 relu fp16 1 +10 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 155 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf15 2.543198098440602 2.0805826545876145 71.1 1.740000000000009 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 162 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +9 gpu conv fp16 12 add fp16 1 relu fp16 1 +10 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf16 2.6224991911009328 2.1476958232678807 70.89999999999999 1.940000000000012 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 167 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +9 gpu conv fp16 12 add fp16 1 relu fp16 1 +10 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf17 2.5978010917593752 2.131515210392801 70.8 2.0400000000000063 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 162 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +9 gpu conv fp16 12 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 157 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf18 2.623210258119482 2.156636511928761 70.76 2.0799999999999983 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv fp16 12 add fp16 1 relu fp16 1 +9 gpu conv fp16 12 add fp16 1 relu fp16 1 +10 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf19 2.598187894495609 2.1322228990374104 70.76 2.0799999999999983 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 162 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +9 gpu conv fp16 12 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf20 2.640464221374653 2.1682626030871295 70.76 2.0799999999999983 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 167 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +9 gpu conv fp16 12 add fp16 1 relu fp16 1 +10 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf21 2.659563405662692 2.1881035849678936 70.54 2.299999999999997 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +9 gpu conv fp16 12 add fp16 1 relu fp16 1 +10 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf22 2.636584103560761 2.1652496021557557 70.39999999999999 2.440000000000012 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 165 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +9 gpu conv fp16 12 add fp16 1 relu fp16 1 +10 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf23 2.6315080449303547 2.161259580137757 70.38 2.460000000000008 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 162 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +9 gpu conv fp16 12 add fp16 1 relu fp16 1 +10 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf24 2.7367939789033153 2.263326406058847 70.34 2.5 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 160 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf25 2.712182817327382 2.2404693918737233 70.24000000000001 2.5999999999999943 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 168 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf26 2.660510795888948 2.187299344706456 70.22 2.6200000000000045 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 159 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +9 gpu conv fp16 12 add fp16 1 relu fp16 1 +10 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf27 2.457573203839654 2.0936930776435383 70.1 2.740000000000009 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv fp16 12 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv fp16 12 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +10 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +----- ++++++ +conf28 2.7452293174567757 2.2593302388139347 69.92 2.9200000000000017 +1 gpu conv fp16 12 add fp16 1 relu fp16 1 +2 gpu conv samp_fp16 262 add fp16 1 relu fp16 1 pool_max fp16 1 +3 gpu conv fp16 12 add fp16 1 relu fp16 1 +4 gpu conv perf_fp16 159 add fp16 1 relu fp16 1 pool_max fp16 1 +5 gpu conv fp16 12 add fp16 1 relu fp16 1 +6 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +7 gpu conv samp_fp16 266 add fp16 1 relu fp16 1 pool_max fp16 1 +8 gpu conv fp16 12 add fp16 1 relu fp16 1 +9 gpu conv samp_fp16 261 add fp16 1 relu fp16 1 +10 gpu conv perf_fp16 152 add fp16 1 relu fp16 1 pool_max fp16 1 +11 gpu conv fp16 12 add fp16 1 relu fp16 1 +12 gpu conv fp16 12 add fp16 1 relu fp16 1 +13 gpu conv perf_fp16 151 add fp16 1 relu fp16 1 pool_max fp16 1 +14 gpu mul fp16 12 add fp16 1 relu fp16 1 +15 gpu mul fp16 12 add fp16 1 relu fp16 1 +16 gpu mul fp16 12 add fp16 1 +17 gpu softmax fp32 1 +-----